In [62]:
%pip install 'databricks-sdk==0.61.0' 'pyarrow<20' 'databricks-sdk[notebook]' 'databricks-agents==1.2.0' 'mlflow<=3.1' 'mlflow[databricks]' 'databricks-vectorsearch==0.57' 'langchain' 'langchain_core' 'databricks-langchain' 'bs4' 'markdownify' 'dotenv'
import os
if os.environ.get("DATABRICKS_RUNTIME_VERSION"):
    dbutils.library.restartPython()

Note: you may need to restart the kernel to use updated packages.


In [63]:
dbutils.widgets.text("vector_search_endpoint", "one-env-shared-endpoint-7")
dbutils.widgets.text("vector_search_index", "tanner_wendland.default.chat_history_index")
dbutils.widgets.text("llm_model_serving_endpoint_name", "databricks-claude-3-7-sonnet")
dbutils.widgets.text("target_catalog", "tanner_wendland")
dbutils.widgets.text("target_schema", "default")

Box(children=(Label(value='vector_search_endpoint'), Text(value='one-env-shared-endpoint-7')))

Box(children=(Label(value='vector_search_index'), Text(value='tanner_wendland.default.chat_history_index')))

Box(children=(Label(value='llm_model_serving_endpoint_name'), Text(value='databricks-claude-3-7-sonnet')))

Box(children=(Label(value='target_catalog'), Text(value='tanner_wendland')))

Box(children=(Label(value='target_schema'), Text(value='default')))

In [64]:
vector_search_endpoint = dbutils.widgets.get("vector_search_endpoint")
vector_search_index = dbutils.widgets.get("vector_search_index")
llm_model_serving_endpoint_name = dbutils.widgets.get("llm_model_serving_endpoint_name")
target_catalog = dbutils.widgets.get("target_catalog")
target_schema = dbutils.widgets.get("target_schema")

In [65]:
from databricks.vector_search.client import VectorSearchClient
import os

vsc: VectorSearchClient = None
if os.environ.get("DATABRICKS_RUNTIME_VERSION"):
    vsc = VectorSearchClient(disable_notice=True)
else:
    import dotenv
    dotenv.load_dotenv('.env')
    vsc = VectorSearchClient(disable_notice=True, workspace_url=os.environ.get("DATABRICKS_HOST"), personal_access_token=os.environ.get("DATABRICKS_TOKEN"))

In [66]:
question = "Tell me about the chat history"

results = vsc.get_index(vector_search_endpoint, vector_search_index).similarity_search(
  query_text=question,
  columns=["message_content"],
  num_results=1)
docs = results.get('result', {}).get('data_array', [])
docs

[NOTICE] Using a Personal Authentication Token (PAT). Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.


[["Hello! I'm starting a new chat history!", 0.004474774]]

In [67]:
import mlflow
mlflow.set_registry_uri("databricks-uc")

In [None]:
### Chain Config

In [101]:
chain_config = {
    "llm_model_serving_endpoint_name": llm_model_serving_endpoint_name,
    "vector_search_endpoint_name": vector_search_endpoint,  # the endoint we want to use for vector search
    "vector_search_index": vector_search_index,
    "llm_prompt_template": """You are an assistant that answers questions. You have access to a vector search tool that searches previous conversations, but you should primarily use the immediate conversation context provided.

Current conversation context: {context}

Instructions:
1. First, use the current conversation context above to answer questions
2. Only use the vector search tool if the question explicitly refers to information from previous conversations that is NOT in the current context
3. If the current conversation context contains sufficient information to answer the question, do NOT use the vector search tool
4. The vector search tool should be used sparingly, only when the user is clearly asking about something from their chat history that isn't in the current conversation""",
}

### Vector Search Config

In [69]:
retriever_config = {
    "parameters": {
        "k": 3,
        "query_type": "hybrid"
    }
}

In [93]:
from typing import Dict

# combine dynamic and static filters for vector search
def create_configurable_with_filters(input: Dict, retriever_config: Dict) -> Dict:
   """
   create configurable object with filters.
   Args:
       input: The input data containing filters.
   Returns:
       A configurable object with filters added to the search_kwargs.
   """
   if "custom_inputs" in input:
       filters = input["custom_inputs"]["filters"]
   else:
       filters = {}
   print(filters)
   configurable = {
       "configurable": {
           "search_kwargs": {
               "k": retriever_config.get("parameters")["k"],
               "query_type": retriever_config.get("parameters").get("query_type"),
               "filter": filters
           }
       }
   }
   return configurable

## Testing Basic Chain

In [94]:
from databricks.vector_search.client import VectorSearchClient
from databricks_langchain import DatabricksVectorSearch
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.utils import ConfigurableField

## Load the chain's configuration
model_config = mlflow.models.ModelConfig(development_config=chain_config)

## Turn the Vector Search index into a LangChain retriever
vector_search_as_retriever = DatabricksVectorSearch(
    endpoint=model_config.get("vector_search_endpoint_name"),
    index_name=model_config.get("vector_search_index"),
    columns=["id", "message_content", "user_name"],
).as_retriever(search_kwargs=retriever_config.get("parameters"))

configurable_vs_retriever = vector_search_as_retriever.configurable_fields(
   search_kwargs=ConfigurableField(
       id="search_kwargs",
       name="Search Kwargs",
       description="The search kwargs to use",
   )
)


# Method to format the docs returned by the retriever into the prompt (keep only the text from chunks)
def format_context(docs):
    chunk_contents = [f"Passage: {d.page_content}\n" for d in docs]
    return "".join(chunk_contents)

#Let's try our retriever chain:
relevant_docs = (configurable_vs_retriever | RunnableLambda(format_context)| StrOutputParser()).invoke('What was my chat history idea?')

print(relevant_docs)

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
Passage: Based on the chat history provided, the first message you sent was:

"I'm looking for a new laptop for work. I need something with good performance for running multiple applications simultaneously, a decent battery life, and preferably under $1500. Any recommendations?"

This was your initial query that started our conversation about laptop recommendations.
Passage: Hello! I'm starting a new chat history!
Passage: From the messages you've given what was the first message I sent?



## Real Chain

In [110]:
from langchain_core.prompts import ChatPromptTemplate
from databricks_langchain.chat_models import ChatDatabricks
from langchain.tools import Tool
from langchain.agents import create_tool_calling_agent, AgentExecutor
from operator import itemgetter

def vector_search_with_filters(query: str, input_data: dict = None) -> str:
    """
    Search the vector database for relevant documents.
    
    Args:
        query: The search query string
        input_data: Optional input data containing filters
    
    Returns:
        Formatted context from retrieved documents
    """
    if input_data is None:
        input_data = {}
    
    # Apply filters securely
    config = create_configurable_with_filters(input_data, retriever_config)
    
    # Retrieve documents
    docs = configurable_vs_retriever.invoke(query, config=config)
    
    # Format and return context
    return format_context(docs)

vector_search_tool = Tool(
    name="search_chat_history",
    description="Retreive chat history from this vector search index for the current user, use this tool to answer questions that may refer to previous conversations but only if we can't answer the question with the immediate conversation context provided.",
    func=lambda query: vector_search_with_filters(query, {}),  # Default case without filters
)

# Updated prompt template for tool-calling agent
prompt = ChatPromptTemplate.from_messages([
    ("system", model_config.get('llm_prompt_template')),
    ("user", "{question}"),
    ("placeholder", "{agent_scratchpad}"),
])

# Our foundation model answering the final prompt
model = ChatDatabricks(
    endpoint=model_config.get("llm_model_serving_endpoint_name"),
    extra_params={"temperature": 0.01, "max_tokens": 500}
)

agent = create_tool_calling_agent(model, [vector_search_tool], prompt)
agent_executor = AgentExecutor(agent=agent, tools=[vector_search_tool], verbose=True)

def extract_user_query_string(chat_messages_array):
    return chat_messages_array[-1]["content"]

def extract_context_string(chat_messages_array):
    return '\n'.join([f"Role: {message['role']} - Content: {message['content']}" for message in chat_messages_array[:-1]])

###########
# RAG Chain with Tool
############
def rag_chain_with_tool(input_data: dict) -> str:
    """
    RAG chain that uses vector search as a tool with proper filtering.
    
    Args:
        input_data: Input containing messages and optional custom_inputs with filters
    
    Returns:
        Generated response
    """
    # Extract user query
    user_query = extract_user_query_string(input_data["messages"])
    
    # Create a modified vector search tool that includes the filters from input_data
    def filtered_vector_search(query: str) -> str:
        return vector_search_with_filters(query, input_data)
    
    # Update the tool with the filtered version
    filtered_tool = Tool(
        name="search_chat_history",
        description="Retreive chat history from this vector search index for the current user, use this tool to answer questions that may refer to previous conversations",
        func=filtered_vector_search,
    )
    
    # Create new agent executor with the filtered tool
    filtered_agent = create_tool_calling_agent(model, [filtered_tool], prompt)
    filtered_agent_executor = AgentExecutor(agent=filtered_agent, tools=[filtered_tool], verbose=True)

    context = extract_context_string(input_data["messages"])
    print(context)
    # Execute the agent
    result = filtered_agent_executor.invoke({"question": user_query, "context": context})
    return result["output"]

# Create a runnable version of the chain
chain = RunnableLambda(rag_chain_with_tool)

### Search WITHOUT Chat Filtering

In [104]:
input_example_no_filter = {"messages": [ {"role": "user", "content": "What was my chat history idea?"}]}
answer_no_filter = chain.invoke(input_example_no_filter)
print(answer_no_filter)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_chat_history` with `idea`
responded: I'll need to search your chat history to find information about your idea. Let me do that for you.

[0m{}
[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
[36;1m[1;3mPassage: test
Passage: I don't see any previous conversations where you've mentioned your favorite fruit. Could you please let me know what fruit you like? Once you share that information, I'll remember it for future conversations.
Passage: test 12
[0m[32;1m[1;3mI don't see any specific chat history about an idea you've shared previously. The search results don't show any detailed information about an idea you might have mentioned. 

If you're referring to a specific idea you shared in a previous conversation, could you provide a bit more cont

### Search WITH filtering

In [105]:
input_example = {"messages": [ {"role": "user", "content": "What was my chat history idea?"}], "custom_inputs": {"filters": {"user_name": "tanner.wendland@databricks.com"}}}
answer = chain.invoke(input_example)
print(answer)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_chat_history` with `chat history idea`
responded: I'll help you find information about your chat history idea. Let me search your previous conversations to locate this information.

[0m{'user_name': 'tanner.wendland@databricks.com'}
[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
[36;1m[1;3mPassage: Hello! I'm starting a new chat history!
Passage: Based on the chat history provided, the first message you sent was:

"I'm looking for a new laptop for work. I need something with good performance for running multiple applications simultaneously, a decent battery life, and preferably under $1500. Any recommendations?"

This was your initial query that started our conversation about laptop recommendations.
Passage: test chat 2
[0m[32;1m[1;3mBased 

In [109]:
input_example = {"messages": [ {"role": "user", "content": "What's the capital of Alaska?"}, {"role": "assistant", "content": "Juneau"}, {"role": "user", "content": "How far is it from Anchorage?"}], "custom_inputs": {"filters": {"user_name": "fake.user@databricks.com"}}}
answer = chain.invoke(input_example)
print(answer)

Role: user - Content: What's the capital of Alaska?
Role: assistant - Content: Juneau


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_chat_history` with `Alaska capital Juneau Anchorage distance`
responded: I'll need to find information about the distance between Anchorage and Juneau (the capital of Alaska that we discussed earlier).

[0m{'user_name': 'fake.user@databricks.com'}
[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
[36;1m[1;3m[0m[32;1m[1;3mBased on our previous conversation, you asked about the capital of Alaska, which is Juneau. Now you're asking about the distance from Anchorage to Juneau.

Juneau and Anchorage are approximately 571 miles (919 kilometers) apart as the crow flies. However, it's important to note that there is no direct road connection between these two major Al

In [112]:
input_example = {"messages": [ {"role": "user", "content": "What's the capital of France?"}], "custom_inputs": {"filters": {"user_name": "tanner.wendland@databricks.com"}}}
answer = chain.invoke(input_example)
print(answer)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_chat_history` with `Databricks Genie`
responded: I'll help you find information about Databricks Genie. Let me search for any previous conversations about this topic first.

[0m{'user_name': 'tanner.wendland@databricks.com'}
[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
[36;1m[1;3mPassage: Tell me about the product Genie from Databricks?
Passage: I don't see any specific information about Databricks Genie in our previous conversations. Let me provide you with information about this product:

# Databricks Genie

Databricks Genie is an AI-powered data assistant that helps data teams work more efficiently with their data. Here are the key features and aspects of Databricks Genie:

## Key Features:
1. **Natural Language Interface**: Allows users 

## CHAIN.PY

This file is here to avoid serialziation issues with using a notebook.

In [15]:
from mlflow.models.resources import DatabricksVectorSearchIndex, DatabricksServingEndpoint

chain_file_path = os.path.join(os.getcwd(), 'chain.py')
if not os.path.exists(chain_file_path):
    raise FileNotFoundError(f"Chain file not found at {chain_file_path}")

# Log the model to MLflow
with mlflow.start_run(run_name="adtech_chat_history_agent"):
  logged_chain_info = mlflow.langchain.log_model(
          #Note: In classical ML, MLflow works by serializing the model object.  In generative AI, chains often include Python packages that do not serialize.  Here, we use MLflow's new code-based logging, where we saved our chain under the chain notebook and will use this code instead of trying to serialize the object.
          lc_model=os.path.join(os.getcwd(), 'chain.py'),  # Chain code file e.g., /path/to/the/chain.py 
          model_config=chain_config, # Chain configuration 
          artifact_path="chain", # Required by MLflow, the chain's code/config are saved in this directory
          input_example=input_example,
          # Specify resources for automatic authentication passthrough
          resources=[
            DatabricksVectorSearchIndex(index_name=model_config.get("vector_search_index")),
            DatabricksServingEndpoint(endpoint_name=model_config.get("llm_model_serving_endpoint_name"))
          ]
      )

model_name = "chat_history_agent"
MODEL_NAME_FQN = f"{target_catalog}.{target_schema}.{model_name}"
# Register to UC
uc_registered_model_info = mlflow.register_model(model_uri=logged_chain_info.model_uri, name=MODEL_NAME_FQN)

{"ts": "2025-08-01 12:41:37,710", "level": "ERROR", "logger": "pyspark.sql.connect.client.logging", "msg": "GRPC Error received", "context": {}, "exception": {"class": "_InactiveRpcError", "msg": "<_InactiveRpcError of RPC that terminated with:\n\tstatus = StatusCode.FAILED_PRECONDITION\n\tdetails = \"BAD_REQUEST: session_id is no longer usable. Generate a new session_id by detaching and reattaching the compute and then try again [sessionId=3c9af5a3-cdb1-4623-873f-7eeddad92a21, reason=INACTIVITY_TIMEOUT]. (requestId=8039babd-55cc-4fef-9b09-7be0080a1137)\"\n\tdebug_error_string = \"UNKNOWN:Error received from peer  {grpc_status:9, grpc_message:\"BAD_REQUEST: session_id is no longer usable. Generate a new session_id by detaching and reattaching the compute and then try again [sessionId=3c9af5a3-cdb1-4623-873f-7eeddad92a21, reason=INACTIVITY_TIMEOUT]. (requestId=8039babd-55cc-4fef-9b09-7be0080a1137)\"}\"\n>", "stacktrace": ["Traceback (most recent call last):", "  File \"/Users/tanner.wen

Exception: '/Users/tanner.wendland/projects/adtech_app_lakebase_vibe_session/data_pipelines/src/mlruns' does not exist.

In [None]:
# Actualy deploy the model to Databricks

In [None]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointCoreConfigInput, ServedEntityInput

workspace_client = WorkspaceClient()

version = uc_registered_model_info.version
serving_endpoint_name = MODEL_NAME_FQN.replace(".", "-")

config = {
        "served_entities": [
            {
                "name": serving_endpoint_name,
                "entity_name": MODEL_NAME_FQN,
                "entity_version": version,
                "workload_size": "Small",
                "scale_to_zero_enabled": True,
            }
        ]
    }

def does_endpoint_exists(endpoint_name):
    try:
        workspace_client.serving_endpoints.get(endpoint_name)
        return True
    except:
        return False

if not does_endpoint_exists(serving_endpoint_name):
    print(f"Creating endpoint {serving_endpoint_name}...")
    workspace_client.serving_endpoints.create_and_wait(
        serving_endpoint_name,
        config=EndpointCoreConfigInput.from_dict(config)
    )
else:
    print(f"Updating endpoint {serving_endpoint_name}...")
    workspace_client.serving_endpoints.update_config_and_wait(
        serving_endpoint_name,
        served_entities=[ServedEntityInput.from_dict(entity) for entity in config['served_entities']]
    )