In [0]:
%run ./config

In [0]:
import os
from azure.identity import DefaultAzureCredential, get_bearer_token_provider


# Expose the service-principal settings as AZURE_* environment variables before the
# credential object below is created.
# NOTE(review): `azure_client_id` / `azure_tenant_id` are not defined in this cell —
# presumably they come from the `%run ./config` cell above; confirm.
os.environ["AZURE_CLIENT_ID"] = azure_client_id
os.environ["AZURE_TENANT_ID"] = azure_tenant_id
# The client secret is read from the Databricks secret scope "azure" (key "rag")
# rather than being written into the notebook.
os.environ["AZURE_CLIENT_SECRET"] = dbutils.secrets.get(scope="azure",key="rag")

# 1. Initialize the credential object
credential = DefaultAzureCredential()

# 2. ⭐ Create the callable token provider using the helper function
# The scope tells Azure what service we want to access.
# `token_provider` is later handed to AzureOpenAI clients as `azure_ad_token_provider`.
token_provider = get_bearer_token_provider(
    credential, "https://cognitiveservices.azure.com/.default"
)



In [0]:
import os
from pathlib import Path
# from azure.core.credentials import AzureKeyCredential # Not needed if using DefaultAzureCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient
from azure.ai.agents.models import FunctionTool # Make sure this is imported!
from dotenv import load_dotenv
from azure.ai.documentintelligence import DocumentIntelligenceClient
from langchain import hub
from langchain_openai import AzureChatOpenAI
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch
from langchain_community.document_loaders import WebBaseLoader

# Load settings from a .env file so the later os.getenv(...) lookups
# (CHUNK_SIZE, OVERLAP, EMBEDDING_DIMENSIONS, ...) can find them.
load_dotenv() 

In [0]:
from azure.storage.blob import BlobServiceClient

# Storage account and container that hold the source PDFs.
STORAGE_ACCOUNT_NAME = "tfstate6666"
CONTAINER_NAME = "pdfs"

storage_account_url = f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net"

try:
    blob_service_client = BlobServiceClient(account_url=storage_account_url, credential=credential)
    container_client = blob_service_client.get_container_client(CONTAINER_NAME)
    # Listing the container is the first call that actually needs the service to
    # answer, so report success only after it has worked. Print just the blob
    # names instead of dumping every BlobProperties object.
    blob_names = [blob.name for blob in container_client.list_blobs()]
    print("✅ Blob service client initialized.")
    print(blob_names)
except Exception as e:
    print(f"Error during client initialization: {e}")
    # Re-raise instead of calling exit(): the cell fails visibly and "Run All"
    # stops here, without tearing down the interpreter session.
    raise


In [0]:


# Unity Catalog coordinates of the volume folder that holds the Markdown files.
catalog, schema, volume, folder = 'rag', 'development', 'blob', 'markdown'

# Resolves to /Volumes/rag/development/blob/markdown
volume_path = "/".join(["/Volumes", catalog, schema, volume, folder])

# Show what is currently stored in that folder.
display(dbutils.fs.ls(volume_path))



### Load the files into a Delta table for versioning, with checkpointing

In [0]:

# Source folder and checkpoint folder, both addressed through the dbfs: scheme.
# The checkpoint directory sits underneath the source folder; the "*.md" glob
# below keeps its files out of the ingest.
markdown_source = 'dbfs:' + volume_path
ingest_checkpoint = f'dbfs:{volume_path}/checkpoints/'

# Read the Markdown files as binary records through the cloudFiles source.
reader = spark.readStream.format('cloudFiles')
reader = reader.option('cloudFiles.format', 'BINARYFILE')
reader = reader.option("pathGlobFilter", "*.md")
df = reader.load(markdown_source)

# Write the records to the raw table using an availableNow trigger,
# and block until the streaming query terminates.
writer = df.writeStream.trigger(availableNow=True)
writer = writer.option("checkpointLocation", ingest_checkpoint)
ingest_query = writer.table('rag.development.md_raw')
ingest_query.awaitTermination()

In [0]:
%sql 
-- Spot-check the ingest: show the path and modification time of two rows of the raw table.
SELECT path, modificationTime FROM rag.development.md_raw LIMIT 2;


In [0]:
# Pull only the binary `content` column of the raw Markdown table.
raw_content_sql = "SELECT content FROM rag.development.md_raw"
df = spark.sql(raw_content_sql)
# Scratch code kept for reference (grabs the first row's bytes):
# binary_data_column = df.select("content")
# md_binary_data = binary_data_column.collect()[0].content

In [0]:
from io import BytesIO
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_core.documents import Document



# Spark/Databricks table and column names
TABLE_NAME = "rag.development.md_raw"
CONTENT_COLUMN = "content"  # The column with binary Markdown data
SOURCE_COLUMN = "path"      # An identifier column (here: the file path)

# Raw environment strings (None when unset). The header-based splitter below does
# not take a size/overlap, so these are only carried along for experiment logging.
chunk_size = os.getenv("CHUNK_SIZE")
overlap = os.getenv("OVERLAP")

# Split on Markdown headers (H1, H2, H3); the header text of each level is stored
# in the chunk metadata under the given key.
headers_to_split_on = [
    ("#", "PageTitle"),
    ("##", "PageSubtitle"),
    ("###", "PageSection"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Accumulators. `final_chunks` is pre-defined so it exists even when the table is
# empty; `all_final_chunks` collects the chunks of every file.
all_final_chunks = []
final_chunks = []
rows_processed = 0
rows_skipped = 0

print(f"🔄 Querying Spark table '{TABLE_NAME}'...")

try:
    # Select the content and a source identifier column.
    # toLocalIterator() streams rows to the driver instead of collecting them all at once.
    query = f"SELECT {CONTENT_COLUMN}, {SOURCE_COLUMN} FROM {TABLE_NAME}"
    df = spark.sql(query)

    print("✅ Query successful. Processing rows...")

    for row in df.toLocalIterator():
        rows_processed += 1
        binary_md_data = row[CONTENT_COLUMN]
        source_identifier = row[SOURCE_COLUMN]

        print(f"  - Processing source: {source_identifier}")

        if not binary_md_data:
            print(f"    ⚠️ Warning: No binary data found for source '{source_identifier}'. Skipping.")
            rows_skipped += 1
            continue

        # Decode the binary Markdown into text. 'utf-8-sig' reads plain UTF-8 and
        # also strips a leading byte-order mark, which would otherwise sit in front
        # of the first "# ..." line and hide that header from the splitter.
        try:
            md_text = binary_md_data.decode('utf-8-sig')
        except UnicodeDecodeError as e:
            # Best effort: one undecodable file must not stop the whole batch.
            print(f"    ❌ Error decoding Markdown for source '{source_identifier}'. Skipping. Error: {e}")
            rows_skipped += 1
            continue

        # The splitter returns Document objects whose metadata already holds the header values.
        final_chunks = markdown_splitter.split_text(md_text)

        # Record which file each chunk came from.
        for chunk in final_chunks:
            chunk.metadata["source"] = source_identifier

        all_final_chunks.extend(final_chunks)

except Exception as e:
    print(f"❌ An error occurred during Spark processing: {e}")
    # Do not continue with a partial chunk list: later cells would embed and
    # index incomplete data without any indication that something went wrong.
    raise

print(
    f"✅ Processed {rows_processed} rows into {len(all_final_chunks)} chunks "
    f"({rows_skipped} rows skipped)."
)




### Create a table in the catalog to hold the chunks & embeddings for versioning

In [0]:
%sql
-- Table holding one row per Markdown chunk together with its embedding vector.
-- The header columns use the metadata keys produced by the header splitter
-- (PageTitle / PageSubtitle / PageSection); `source` is the originating file path.
CREATE TABLE IF NOT EXISTS rag.development.md_chunks (
  id BIGINT GENERATED BY DEFAULT AS IDENTITY, -- surrogate row id standing in for a PK (no PRIMARY KEY constraint is declared)
  source STRING,
  PageTitle STRING, 
  PageSubtitle STRING,
  PageSection STRING,
  content STRING,
  embedding ARRAY <FLOAT>
) TBLPROPERTIES (delta.enableChangeDataFeed = true); -- change data feed is switched on for this table

In [0]:
import os
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI

# Target the delta table
TARGET_TABLE_NAME = "rag.development.md_chunks"

# Column layout written to TARGET_TABLE_NAME, as a Spark DDL schema string. It mirrors
# the CREATE TABLE statement for rag.development.md_chunks (minus the generated `id`),
# so the embedding column is ARRAY<FLOAT> instead of the ARRAY<DOUBLE> that schema
# inference would produce from Python floats.
CHUNK_TABLE_SCHEMA = (
    "source STRING, PageTitle STRING, PageSubtitle STRING, "
    "PageSection STRING, content STRING, embedding ARRAY<FLOAT>"
)

# Number of chunk texts sent per embeddings request. Deliberately small so that a
# large corpus does not end up in one oversized request.
EMBEDDING_BATCH_SIZE = 64

# Embed the chunks of *every* file (`all_final_chunks`), not just the chunks of the
# last file processed by the splitting loop.
if all_final_chunks:
    print(f"Processing {len(all_final_chunks)} document chunks...")
    credential = DefaultAzureCredential()

    # 1. 🤖 Generate Embeddings from Azure OpenAI
    print("Generating embeddings for all chunks...")
    # The client expects the resource base URL; it appends the deployment path and
    # the api-version itself.
    embedding_endpoint = "https://aifoundry6666.cognitiveservices.azure.com/"
    embedding_model_name = "text-embedding-3-large"
    token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")

    embedding_client = AzureOpenAI(
        api_version="2024-02-01",
        azure_endpoint=embedding_endpoint,
        azure_ad_token_provider=token_provider,
    )

    # Embed in batches; `embeddings[i]` always belongs to `all_final_chunks[i]`.
    chunks_text = [doc.page_content for doc in all_final_chunks]
    embeddings = []
    for batch_start in range(0, len(chunks_text), EMBEDDING_BATCH_SIZE):
        batch_texts = chunks_text[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
        embedding_response = embedding_client.embeddings.create(
            model=embedding_model_name, input=batch_texts
        )
        # Order by the `index` field so vectors line up with the inputs of this batch.
        ordered_items = sorted(embedding_response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered_items)

    if len(embeddings) != len(all_final_chunks):
        raise RuntimeError(
            f"Expected {len(all_final_chunks)} embeddings but received {len(embeddings)}."
        )
    print(f"✅ Generated {len(embeddings)} embedding vectors.")

    # 2. 📝 Combine chunks, metadata, and embeddings
    # One tuple per chunk, in CHUNK_TABLE_SCHEMA column order. Header levels that a
    # chunk does not have become NULL.
    data_for_df = [
        (
            doc.metadata.get("source"),
            doc.metadata.get("PageTitle"),
            doc.metadata.get("PageSubtitle"),
            doc.metadata.get("PageSection"),
            doc.page_content,
            [float(value) for value in emb],
        )
        for doc, emb in zip(all_final_chunks, embeddings)
    ]

    # 3. 🔄 Create the Spark DataFrame with the table's exact column types
    print("Creating Spark DataFrame with embeddings...")
    df_chunks = spark.createDataFrame(data_for_df, schema=CHUNK_TABLE_SCHEMA)
    df_to_write = df_chunks

    # 4. 💾 Write the DataFrame to the Delta table
    # NOTE: this is an append — re-running the cell adds the same chunks again.
    print(f"Appending {len(data_for_df)} chunks with embeddings to Delta table: {TARGET_TABLE_NAME}...")

    (df_to_write.write
      .format("delta")
      .mode("append")
      .option("mergeSchema", "true")
      .saveAsTable(TARGET_TABLE_NAME))

    print(f"✅ Successfully saved to {TARGET_TABLE_NAME}")

    # 5. ✅ Verify the written data (optional)
    # display(spark.sql(f"SELECT source, PageTitle, content, embedding FROM {TARGET_TABLE_NAME} LIMIT 2"))

else:
    print("⚠️ No chunks were generated, so no data was processed or saved.")

# Build the AI search index

In [0]:
from azure.identity import DefaultAzureCredential
import uuid
from azure.core.exceptions import ResourceExistsError
from openai import AzureOpenAI
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    ScoringProfile, # Import ScoringProfile
    TextWeights,      # Import TextWeights
)

index_name = "ragamuffin-index"
search_endpoint = "https://search6666.search.windows.net"

# Documents sent per upload request. Each document carries a full embedding vector,
# so batches are kept small to keep request payloads modest.
UPLOAD_BATCH_SIZE = 100

# Every chunk needs exactly one vector; fail early instead of mis-pairing them
# (or hitting an IndexError half-way through building the upload payload).
if not all_final_chunks:
    raise ValueError("No chunks available to index.")
if len(embeddings) != len(all_final_chunks):
    raise ValueError(
        f"Chunk/embedding count mismatch: {len(all_final_chunks)} chunks vs "
        f"{len(embeddings)} embeddings. Embed all chunks before indexing."
    )

# The vector field dimension must be an integer and must equal the real vector length.
# EMBEDDING_DIMENSIONS (a string when set) is honoured if present, otherwise the
# length of the first vector is used.
embedding_dimensions = int(os.getenv("EMBEDDING_DIMENSIONS") or len(embeddings[0]))
if embedding_dimensions != len(embeddings[0]):
    raise ValueError(
        f"EMBEDDING_DIMENSIONS={embedding_dimensions} does not match the embedding "
        f"length {len(embeddings[0])}."
    )

# --- Step 6: Define and Create the Search Index ---
print("3. Defining and creating search index schema...")
index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)

fields = [
    SimpleField(name="id", type="Edm.String", key=True),
    SearchableField(name="content", type="Edm.String", searchable=True),
    SearchableField(name="source", type="Edm.String", searchable=True),
    # Header fields for filtering and context
    SearchableField(name="page_title", type="Edm.String", filterable=True, facetable=True),
    SearchableField(name="page_subtitle", type="Edm.String", filterable=True, facetable=True),
    SearchableField(name="page_section", type="Edm.String", filterable=True, facetable=True),
    SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=embedding_dimensions,
                vector_search_profile_name="my-hnsw-profile"),
]

vector_search = VectorSearch(
    profiles=[VectorSearchProfile(name="my-hnsw-profile", algorithm_configuration_name="my-hnsw-config")],
    algorithms=[HnswAlgorithmConfiguration(name="my-hnsw-config")]
)

# Scoring profile that weights header fields above body text for text queries.
# NOTE(review): the profile is defined but not made the index default, so queries
# presumably have to name it explicitly for it to apply — confirm.
scoring_profile = ScoringProfile(
    name="boost_section_profile",
    text_weights=TextWeights(
        weights={
            "page_section": 5,
            "page_title": 3,
            "page_subtitle": 2,
            "content": 1
        }
    )
)

index = SearchIndex(name=index_name,
                    fields=fields,
                    vector_search=vector_search,
                    scoring_profiles=[scoring_profile]
                    )

# Report the index that was actually created (`index_name`), not an env value.
# The original nested double quotes inside the f-string, which is a SyntaxError
# before Python 3.12.
try:
    index_client.create_index(index)
    print(f"   Index '{index_name}' created.")
except ResourceExistsError:
    print(f"   Index '{index_name}' already exists.")

# --- Step 7: Prepare and Upload Documents 📤 ---
print("4. Preparing and uploading documents to the index...")
documents_to_upload = []
chunk_ordinals = {}  # source -> number of chunks seen so far for that source
for doc, vector in zip(all_final_chunks, embeddings):
    source = doc.metadata.get("source")
    ordinal = chunk_ordinals.get(source, 0)
    chunk_ordinals[source] = ordinal + 1
    documents_to_upload.append({
        # Deterministic key (source + position within that source): re-running the
        # cell overwrites the same documents instead of adding duplicates.
        "id": str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source}#{ordinal}")),
        "content": doc.page_content,
        "content_vector": vector,
        "source": source,
        # Metadata keys are the ones the header splitter writes (no spaces).
        # Missing header levels stay None rather than the literal string "None".
        "page_title": doc.metadata.get("PageTitle"),
        "page_subtitle": doc.metadata.get("PageSubtitle"),
        "page_section": doc.metadata.get("PageSection"),
    })

# Upload to the same service and index that were just created.
search_client = SearchClient(endpoint=search_endpoint, index_name=index_name, credential=credential)
for batch_start in range(0, len(documents_to_upload), UPLOAD_BATCH_SIZE):
    upload_batch = documents_to_upload[batch_start:batch_start + UPLOAD_BATCH_SIZE]
    upload_results = search_client.upload_documents(documents=upload_batch)
    failed_keys = [result.key for result in upload_results if not result.succeeded]
    if failed_keys:
        raise RuntimeError(f"{len(failed_keys)} documents failed to index, e.g. {failed_keys[:5]}")
print(f"   ✅ Upload complete! {len(documents_to_upload)} documents sent to '{index_name}'.")

 

## Use the Search SDK and track with mlflow to test retriever

In [0]:
from azure.search.documents.models import VectorizedQuery
import mlflow

# Relies on names created by earlier cells: `embedding_client`, `embedding_model_name`,
# `credential`, `index_name`, `chunk_size` and `overlap`.

@mlflow.trace()
def retrieve_from_azure_ai_search(query: str, top_k: int = 3):
    """
    Performs a vector search in Azure AI Search and logs details to MLflow.

    Args:
        query: Natural-language question; it is embedded and used as the query vector.
        top_k: Number of nearest neighbours to request and results to return.

    Returns:
        A list of dicts with the keys ``source``, ``page_title``, ``page_subtitle``,
        ``page_section``, ``content`` and ``score`` (the ``@search.score`` of the hit).
    """
    AZURE_SEARCH_ENDPOINT = "https://search6666.search.windows.net"
    # Single client bound to the index built earlier. The original first built a
    # client with the endpoint URL passed as the index name, then replaced it.
    search_client = SearchClient(endpoint=AZURE_SEARCH_ENDPOINT,
                                 index_name=index_name,
                                 credential=credential)

    # `embedding_model_name` is the embedding deployment; `model_name` is assigned
    # later in the notebook to the registered MLflow model and must not be used here.
    params = {
        "search_text": query,
        "embedding_model": embedding_model_name,
        "search_index_name": index_name,
        "top_k": top_k,
        "search_type": "hnsw",
        "embedding_dimensions": os.getenv("EMBEDDING_DIMENSIONS"),
        "chunk_size": chunk_size,
        "overlap": overlap,
    }
    mlflow.log_params(params)

    # Generate the query vector
    response = embedding_client.embeddings.create(input=query, model=embedding_model_name)
    query_vector = response.data[0].embedding

    vector_query = VectorizedQuery(
        vector=query_vector,
        k_nearest_neighbors=top_k,
        fields="content_vector"  # The name of the vector field in the index
    )

    # Perform the vector search (no search text is passed, only the vector query)
    results = search_client.search(
        select=["source", "page_title", "page_subtitle", "page_section", "content"],
        vector_queries=[vector_query],
        top=top_k
    )

    # Format the retrieved documents; every selected field is returned once.
    retrieved_docs = [
        {
            "source": doc["source"],
            "page_title": doc["page_title"],
            "page_subtitle": doc["page_subtitle"],
            "page_section": doc["page_section"],
            "content": doc["content"],
            "score": doc["@search.score"],
        }
        for doc in results
    ]

    # --- Log RAG Metrics ---
    if retrieved_docs:
        scores = [doc["score"] for doc in retrieved_docs]
        mlflow.log_metric("retrieved_docs_count", len(retrieved_docs))
        mlflow.log_metric("average_retrieval_score", sum(scores) / len(scores))
        mlflow.log_metric("min_retrieval_score", min(scores))

    return retrieved_docs


# Select the retriever-evaluation experiment and label it.
mlflow.set_experiment("/Users/huy.d@hotmail.com/RAG_with_Azure_AI_Search_exp")
description = "Evaluating retriever"
experiment_tags = dict(project="RAG", domain="DA", purpose="Retrieval evaluation")
mlflow.set_experiment_tags(experiment_tags)

# Run a single tracked query; the context manager ends the run when the block exits.
retrieval_run = mlflow.start_run(run_name="Retrieval Test")
with retrieval_run:
    print("\nPerforming tracked retrieval inside an MLflow run...")
    retrieved_documents = retrieve_from_azure_ai_search(query="what is a ragamuffin", top_k=5)

    # Optional inspection of the hits:
    # print("\nRetrieved Documents:")
    # for doc in retrieved_documents:
    #     print(f"  page_section: {doc.get('page_section')}, Score: {doc.get('score'):.4f}, Text: {doc.get('content')}")

print("\n✅ Retrieval process tracked in MLflow. Run 'mlflow ui' to view the trace.")

## Build out the RAG Experiment

In [0]:
import os
import time
import mlflow
import pandas as pd
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from mlflow.models import ModelSignature
from mlflow.types import Schema, ColSpec
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# --- Configuration ---
# Values are now loaded from the .env file
# os.getenv returns a string, or None when the variable is unset; numeric settings
# (GENERATION_TEMPERATURE, MAX_TOKENS) are converted where they are used.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_INDEX_NAME = os.getenv("AZURE_SEARCH_INDEX_NAME")
SEARCH_TYPE=os.getenv("SEARCH_TYPE")
CHAT_MODEL_DEPLOYMENT = os.getenv("CHAT_MODEL_DEPLOYMENT")
EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL_NAME")
# NOTE(review): the variable name is misspelt ("EMEDDING"); it is read from the
# correctly spelt EMBEDDING_ENDPOINT environment variable and used for embedding_client below.
EMEDDING_ENDPOINT=os.getenv("EMBEDDING_ENDPOINT")
OPENAI_API_VERSION = os.getenv("OPENAI_API_VERSION")
GENERATION_TEMPERATURE=os.getenv("GENERATION_TEMPERATURE")
MAX_TOKENS=os.getenv("MAX_TOKENS")
# CHUNK_SIZE, OVERLAP and EMBEDDING_DIMENSIONS are only logged as run parameters in this cell.
CHUNK_SIZE=os.getenv("CHUNK_SIZE")
OVERLAP=os.getenv("OVERLAP")
EMBEDDING_DIMENSIONS=os.getenv("EMBEDDING_DIMENSIONS")
# Service-principal settings are set before the credential is created further down.
# NOTE(review): azure_client_id / azure_tenant_id are not defined in this cell —
# presumably they come from the `%run ./config` cell; confirm.
os.environ["AZURE_CLIENT_ID"] = azure_client_id
os.environ["AZURE_TENANT_ID"] = azure_tenant_id
os.environ["AZURE_CLIENT_SECRET"] = dbutils.secrets.get(scope="azure",key="rag")
# Define the Delta table used as the source for the RAG knowledge base
SOURCE_DELTA_TABLE = os.getenv("SOURCE_DELTA_TABLE")
# NOTE(review): duplicate of the SEARCH_TYPE assignment above.
SEARCH_TYPE=os.getenv("SEARCH_TYPE")
# --- MLflow Setup ---
MLFLOW_EXPERIMENT_PATH = os.getenv("MLFLOW_EXPERIMENT_PATH")
mlflow.set_experiment(MLFLOW_EXPERIMENT_PATH)
# NOTE(review): looks like a leftover debug print of the MAX_TOKENS setting.
print(MAX_TOKENS)
# Constants that are both used by the pipeline and logged as run parameters
SYSTEM_PROMPT = "You are an intelligent assistant. Use the context provided to answer the user's question."
RETRIEVAL_FIELDS = ["source", "page_title", "content"]

# --- Client Initialization (for experiment run) ---
# One credential is shared by the token provider (OpenAI clients) and the search client.
credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")

# Chat-completion client (used by generate_answer)
aoai_client = AzureOpenAI(
    api_version=OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_ad_token_provider=token_provider,
)
# Embedding client (used by retrieve_documents); same API version, separate endpoint setting
embedding_client = AzureOpenAI(
    api_version=OPENAI_API_VERSION,
    azure_endpoint=EMEDDING_ENDPOINT,
    azure_ad_token_provider=token_provider,
)
# Azure AI Search client bound to the configured index
search_client = SearchClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    index_name=AZURE_SEARCH_INDEX_NAME,
    credential=credential
)

# --- RAG Functions (for tracing during experiment) ---
def retrieve_documents(query, top_k):
    """Embed ``query`` and return the ``top_k`` nearest index entries.

    The query is embedded with ``embedding_client`` and searched against the
    ``content_vector`` field through ``search_client``. Each hit is reduced to a
    dict with the keys ``source``, ``page_title`` and ``content``.
    """
    embedded = embedding_client.embeddings.create(model=EMBEDDING_MODEL_NAME, input=query)
    nearest_neighbours = VectorizedQuery(
        vector=embedded.data[0].embedding,
        k_nearest_neighbors=top_k,
        fields="content_vector",
    )

    hits = search_client.search(
        select=RETRIEVAL_FIELDS,
        vector_queries=[nearest_neighbours],
        top=top_k,
    )

    documents = []
    for hit in hits:
        documents.append({
            "source": hit["source"],
            "page_title": hit["page_title"],
            "content": hit["content"],
        })
    return documents

def generate_answer(query, retrieved_docs):
    """Ask the chat model to answer ``query`` using the retrieved documents.

    The ``content`` of every retrieved document is joined (blank line between
    documents) into a context block that is sent, together with the question,
    as the user turn; ``SYSTEM_PROMPT`` is sent as the system turn.

    Returns:
        A tuple ``(answer_text, user_message, token_usage)``: the model's reply,
        the augmented prompt that was sent, and the response's ``usage`` object.
    """
    context = "\n\n".join(doc["content"] for doc in retrieved_docs)
    user_message = f"CONTEXT:\n---\n{context}\n---\nQUESTION: {query}"

    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": user_message}

    # Temperature and token limit are stored as strings and converted per call.
    completion = aoai_client.chat.completions.create(
        model=CHAT_MODEL_DEPLOYMENT,
        messages=[system_turn, user_turn],
        temperature=float(GENERATION_TEMPERATURE),
        max_tokens=int(MAX_TOKENS),
    )

    answer_text = completion.choices[0].message.content
    return answer_text, user_message, completion.usage

@mlflow.trace()
def run_rag_pipeline_traced(query, top_k):
    """Run retrieval then generation, each inside its own MLflow span.

    Returns:
        ``(final_answer, documents, augmented_prompt, retrieval_time,
        generation_time, token_usage)`` where the two times are wall-clock
        durations in seconds measured with ``time.time()``.
    """
    # Stage 1: retrieval
    with mlflow.start_span("retrieval") as retrieval_span:
        retrieval_started = time.time()
        documents = retrieve_documents(query, top_k)
        retrieval_time = time.time() - retrieval_started
        retrieval_span.set_outputs({"documents": documents})

    # Stage 2: generation
    with mlflow.start_span("generation") as generation_span:
        generation_started = time.time()
        generated = generate_answer(query, documents)
        generation_time = time.time() - generation_started
        final_answer, augmented_prompt, token_usage = generated
        generation_span.set_outputs({"answer": final_answer})

    return (
        final_answer,
        documents,
        augmented_prompt,
        retrieval_time,
        generation_time,
        token_usage,
    )

# --- Main Experiment and Registration Run ---
# One MLflow run that (1) executes the traced RAG pipeline for a single question and
# logs its parameters, metrics and results, then (2) logs and registers the model.
user_query = "What are ragamuffins?"
top_k_value = 5

with mlflow.start_run(run_name="RAG Experiment and Model Registration") as run:
    run_id = run.info.run_id
    print(f"🚀 Starting MLflow run: {run.info.run_name} ({run_id})")
    print(f"Linking source dataset: {SOURCE_DELTA_TABLE}")

    # NOTE(review): only the table name is passed — no table version is pinned here.
    source_dataset = mlflow.data.load_delta(
        table_name=SOURCE_DELTA_TABLE
    )

    # Log the dataset as an input to the run for traceability
    mlflow.log_input(source_dataset, context="source_documents")

    # === Part 1: Run Experiment & Log Results ===
    # Parameters describing this configuration (values come from the environment-backed
    # constants; RETRIEVAL_FIELDS is a list).
    mlflow.log_params({
        "top_k": top_k_value,
        "embedding_model": EMBEDDING_MODEL_NAME,
        "chat_model": CHAT_MODEL_DEPLOYMENT,
        "search_index_name": AZURE_SEARCH_INDEX_NAME,
        "system_prompt": SYSTEM_PROMPT,
        "search_type": SEARCH_TYPE,
        "chunk_size":CHUNK_SIZE,
        "overlap":OVERLAP,
        "embedding_dimensions": EMBEDDING_DIMENSIONS,
        "temperature": GENERATION_TEMPERATURE,
        "retrieval_fields": RETRIEVAL_FIELDS
    })

    # Execute the traced pipeline
    final_answer, docs, prompt, ret_time, gen_time, tokens = run_rag_pipeline_traced(
        query=user_query,
        top_k=top_k_value
    )

    # Log metrics and artifacts: stage timings (rounded to 2 decimals) and token counts
    mlflow.log_metrics({
        "retrieval_time_sec": round(ret_time, 2),
        "generation_time_sec": round(gen_time, 2),
        "total_time_sec": round(ret_time + gen_time, 2),
        "prompt_tokens": tokens.prompt_tokens,
        "completion_tokens": tokens.completion_tokens,
        "total_tokens": tokens.total_tokens,
    })

    # One-row table with the question, the augmented prompt, the answer and the
    # retrieved documents (stringified).
    rag_table = pd.DataFrame({
        "prompt": [user_query],
        "augmented_prompt": [prompt],
        "final_answer": [final_answer],
        "retrieved_documents": [str(docs)]
    })
    mlflow.log_table(data=rag_table, artifact_file="rag_results.json")

    print("✅ Experiment results logged.")
    print(f"\nFinal Answer:\n{final_answer}\n")

    # === Part 2: Log and Register the Self-Contained Model ===
    print("🎬 Starting model logging and registration...")

    # Define the model signature: (question, top_k) in, (answer, retrieved_documents) out
    input_schema = Schema([ColSpec("string", "question"), ColSpec("long", "top_k")])
    output_schema = Schema([
    ColSpec("string", "answer"), 
    ColSpec("string", "retrieved_documents")])
    signature = ModelSignature(inputs=input_schema, outputs=output_schema)

    # Define an input example
    input_example = pd.DataFrame({"question": [user_query], "top_k": [top_k_value]})

    # Define pip requirements for the model environment
    pip_requirements = [
        "mlflow>=2.10",
        "pandas",
        "openai>=1.12.0",
        "azure-identity>=1.15.0",
        "azure-search-documents>=11.4.0"
    ]

    # Log the model using the code from rag_model.py
    model_info = mlflow.pyfunc.log_model(
        artifact_path="rag_model",
        python_model="rag_model.py", # path to the model script (a file name is passed, not a class instance)
        input_example=input_example,
        pip_requirements=pip_requirements,
        signature=signature
    )

    print("📦 Model logged successfully.")

    # Register the logged model to the Unity Catalog Model Registry
    print("🖋️ Registering model in the Model Registry...")
    registered_model = mlflow.register_model(
        model_uri=model_info.model_uri,
        name="rag.development.rag_model" # UC 3-level name: catalog.schema.model
    )
    print(f"✅ Model '{registered_model.name}' version {registered_model.version} registered!")

In [0]:
import mlflow.pyfunc
import pandas as pd

# Identify the registered model version to smoke-test.
model_name = "rag.development.rag_model"
model_version = "10"  # Specify the version you want to use
model_uri = "models:/" + "/".join((model_name, model_version))

# Fetch it as a generic PyFunc model.
loaded_model = mlflow.pyfunc.load_model(model_uri)

# One-row input frame matching the model signature (question, top_k).
input_data = pd.DataFrame(
    {
        "question": ["What are ragamuffins?"],
        "top_k": [5],
    }
)

# Run inference and show the result.
predictions = loaded_model.predict(input_data)
display(predictions)

In [0]:


from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointCoreConfigInput, ServedModelInput

w = WorkspaceClient()

# 1. Define the endpoint and model details
endpoint_name = "rag-chatbot-endpoint"
model_name = "rag.development.rag_model"
model_version = "10"

# 2. Environment variables handed to the served model
env_vars = {
    # --- Service principal ---
    # Client and tenant ids reuse the values the rest of the notebook authenticates
    # with (`azure_client_id` / `azure_tenant_id`, expected from the config notebook)
    # instead of hard-coded GUIDs. Only the client secret is a secret reference
    # (scope "azure", key "rag").
    "AZURE_CLIENT_ID":       azure_client_id,
    "AZURE_TENANT_ID":       azure_tenant_id,
    "AZURE_CLIENT_SECRET":   "{{secrets/azure/rag}}",

    # --- Azure Service Endpoints & Versions (as plain text) ---
    "OPENAI_API_VERSION":      "2024-02-01",
    "AZURE_OPENAI_ENDPOINT":   "https://aifoundry6666.openai.azure.com/",
    "AZURE_SEARCH_ENDPOINT":   "https://search6666.search.windows.net",
    "AZURE_SEARCH_INDEX_NAME": "ragamuffin-index",

    # --- Model & Embedding Configuration (as plain text) ---
    "EMBEDDING_MODEL_NAME":    "text-embedding-3-large",
    "CHAT_MODEL_DEPLOYMENT":   "gpt-4.1-mini",
    "SYSTEM_MESSAGE_PROMPT":   "You are an intelligent assistant...",
    "TOP_K":                   "5",
    "GENERATION_TEMPERATURE":  "0.1"
}

# 3. Which model version to deploy, the workload size, and scaling.
served_model = ServedModelInput(
    model_name=model_name,
    model_version=model_version,
    workload_size="Small",  # Options: "Small", "Medium", "Large"
    scale_to_zero_enabled=True,  # Recommended to save costs
    environment_vars=env_vars
)

# An endpoint can serve multiple models, so the config takes a list.
endpoint_config = EndpointCoreConfigInput(
    served_models=[served_model]
)

# 4. Create the endpoint, or update it when it already exists, and wait until ready.
# A plain create call fails on a second run because the endpoint name is taken,
# so the cell checks for the endpoint first to stay re-runnable.
print(f"Creating or updating endpoint: {endpoint_name}")
endpoint_exists = any(ep.name == endpoint_name for ep in w.serving_endpoints.list())
if endpoint_exists:
    w.serving_endpoints.update_config_and_wait(
        name=endpoint_name,
        served_models=[served_model]
    )
else:
    w.serving_endpoints.create_and_wait(
        name=endpoint_name,
        config=endpoint_config
    )

print(f"✅ Endpoint '{endpoint_name}' is now active and serving version {model_version}.")