In [0]:
%pip install --upgrade langchain-databricks langchain-community langchain databricks-sql-connector nltk rouge-score langchain-experimental duckduckgo-search

Collecting langchain-databricks
  Using cached langchain_databricks-0.1.1-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-community
  Using cached langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain
  Using cached langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting databricks-sql-connector
  Using cached databricks_sql_connector-3.6.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting rouge-score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Collecting databricks-vectorsearch<0.41,>=0.40 (from langchain-databricks)
  Using cached databricks_vectorsearch-0.40-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-core<0.4,>=0.2.35 (from langchain-databricks)
  Using cached langchain_core-0.3.19-py3-none-any.whl.metadata (6.3 kB)
Collecting mlflow>=2.16.0 (from langchain-databricks)
  Using cached mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting SQLAlchem

In [0]:
%restart_python or dbutils.library.restartPython()

# Proof of concept (POC)

Here we want to make a simple chain that takes a question and gives an answer

In [0]:
import os
from langchain import hub
from langchain.agents import Tool, create_react_agent
from langchain.agents.agent import AgentExecutor
from langchain_experimental.utilities import PythonREPL
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.tools import tool
import mlflow
from langchain_databricks import ChatDatabricks

chat_model = ChatDatabricks(
                            endpoint="databricks-meta-llama-3-1-70b-instruct",
                            temperature=0.1,
                            max_tokens=250,
                        )

In [0]:
# Tools
python_repl = PythonREPL()
search = DuckDuckGoSearchRun()
repl_tool = Tool(
    name="python_repl",
    description="A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.",
    func=python_repl.run,
)
search_tool = Tool(
    name="search",
    description="Search the web for information",
    func=search.run,
)
@tool
def make_a_poem(topic: str) -> str:
    """Make a poem about a topic."""
    prompt = f"Make a poem about {topic}"
    response = chat_model.invoke(prompt)
    return response.content

# prompt
prompt = hub.pull("hwchase17/react")




In [0]:
# create agents
agent = create_react_agent(
    llm=chat_model,
    tools=[repl_tool, search_tool, make_a_poem],
    prompt=prompt,
)
agent_executor = AgentExecutor(
    agent=agent,
    tools=[repl_tool, search_tool, make_a_poem],
    max_iterations=5,
    handle_parsing_errors=True,
    verbose=True
)

In [0]:
query ="what is EPM in colombia?"
agent_executor.invoke({"input": query})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: search
Action Input: EPM in Colombia[0m[33;1m[1;3mImage by EPM (www.epm.com.co) US-based Invenergy LLC has signed a strategic agreement with Colombian power utility Empresas Publicas de Medellin ESP (EPM) to develop and build at least 400 MW of solar and wind projects in Colombia by 2025. The deal, announced on Wednesday, will see the city of Medellin-owned company sell the power generated by ... Colombian utility Empresas Publicas de Medellin (EPM) announced on Friday that its 83-MW Tepuy solar farm in the central-west part of Colombia had been connected to the national grid and reached commercial operation. The Tepuy project, located in the municipality of La Dorada, department of Caldas, required around COP 397 billion (USD 96.1m/EUR ... Colombian public utility Empresas Públicas de Medellín (EPM) has reached commercial operation at a 83MW solar PV plant. Located in the central department of Caldas, nearly 200,0

{'input': 'what is EPM in colombia?',
 'output': 'EPM in Colombia refers to Empresas Publicas de Medellin, a Colombian power utility company involved in various energy projects.'}

# ML Flow

## Helper functions

In [0]:
def log_chain_to_mlflow(chain, wrapper, experiment_name: str = "example-chain"):
    """Helper function for logging a chain to MLflow"""
    # Get Path for MLflow
    if IS_DATABRICKS:
        # Use workspace path for Databricks
        experiment_path = f"/Shared/{experiment_name}"
        try:
            experiment = mlflow.get_experiment_by_name(experiment_path)
            if experiment is None:
                mlflow.create_experiment(experiment_path)
            mlflow.set_experiment(experiment_path)
        except Exception as e:
            print(f"Error setting up Databricks experiment: {e}")
            raise
    else:
        # Local experiment setup
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment is None:
            mlflow.create_experiment(experiment_name)
        mlflow.set_experiment(experiment_name)

    # Define model signature
    from mlflow.models.signature import ModelSignature
    from mlflow.types import Schema, ColSpec, DataType
    
    # Define input schema (question column)
    input_schema = Schema([
        ColSpec(DataType.string, "input")
    ])
    
    # Define output schema (model returns string)
    output_schema = Schema([
        ColSpec(DataType.string)
    ])
    
    # Create signature
    signature = ModelSignature(inputs=input_schema, outputs=output_schema)

    # Start logging of model
    with mlflow.start_run() as run:
        config = chain.get_config()
        mlflow.log_params({
            "model_name": config["model_name"],
            "temperature": config["temperature"],
            "environment": "databricks" if IS_DATABRICKS else "local"
        })

        # Create temporary file for prompt template
        prompt_path = None
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
            f.write(str(config["prompt_template"]))
            prompt_path = f.name

        try:
            # Create wrapper
            wrapper = wrapper(chain)
            # Log the model with requirements and signature
            requirements = [
            "langchain-databricks",
            "langchain-community",
            "langchain databricks-sql-connector",
            "langchain-experimental",
            "duckduckgo-search"
            ]
            if IS_DATABRICKS:
                requirements.append("databricks-mlflow")
            
            logged_model = mlflow.pyfunc.log_model(
                artifact_path="artifacts",
                python_model=wrapper,
                artifacts={"prompt_template": prompt_path},
                pip_requirements=requirements,
                signature=signature  # Add the signature here
            )
        finally:
            if prompt_path and os.path.exists(prompt_path):
                os.unlink(prompt_path)

    return run.info.run_id, logged_model.model_uri

def load_chain_from_mlflow(run_id: str, experiment_name = "example_chain", ):
    """
    Helper function to load a chain from MLFlow
    """
    try:
        if IS_DATABRICKS:
            model_uri = f"runs:/{run_id}/artifacts"
            chain = mlflow.pyfunc.load_model(model_uri)
            return chain
        else:
            model_uri = f"runs:/{run_id}/{experiment_name}"
            chain = mlflow.pyfunc.load_model(model_uri)
            return chain
    except Exception as e:
        print(f"Error loading chain from MLflow: {e}")
        raise

## Defining chain as a Model with a wrapper

Class for the chain

In [0]:
import mlflow
from langchain_databricks import ChatDatabricks
import pandas as pd
from typing import Dict, Any
import os
import tempfile

try:
    import databricks.mlflow
    IS_DATABRICKS = True
    import dbutils
except ImportError:
    IS_DATABRICKS = False

class myChain:
    def __init__(self, model_name, prompt_template = hub.pull("hwchase17/react"), temperature=0):
        self.model_name = model_name
        self.prompt_template = prompt_template
        self.temperature = temperature

    def run_chain(self, query):
        # Model
        chat_model = ChatDatabricks(
                            endpoint=self.model_name,
                            temperature=0.1,
                            max_tokens=250,
                        )
        # Tools
        python_repl = PythonREPL()
        search = DuckDuckGoSearchRun()
        repl_tool = Tool(
            name="python_repl",
            description="A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.",
            func=python_repl.run,
        )
        search_tool = Tool(
            name="search",
            description="Search the web for information",
            func=search.run,
        )
        @tool
        def make_a_poem(topic: str) -> str:
            """Make a poem about a topic."""
            prompt = f"Make a poem about {topic}"
            response = chat_model.invoke(prompt)
            return response.content
        
        # Agents
        agent = create_react_agent(
            llm=chat_model,
            tools=[repl_tool, search_tool, make_a_poem],
            prompt=self.prompt_template,
        )
        agent_executor = AgentExecutor(
            agent=agent,
            tools=[repl_tool, search_tool, make_a_poem],
            max_iterations=5,
            handle_parsing_errors=True,
            verbose=True
        )
        return agent_executor.invoke({"input": query})['output']

    def __call__(self, query):
        result = self.run_chain(query)
        return result
    
    def get_config(self) -> Dict[str, Any]:
        """Get chain configuration for MLflow tracking"""
        return {
            "model_name": self.model_name,
            "temperature": self.temperature,
            "prompt_template": self.prompt_template
        }



Wrapper for MLFlow

In [0]:
class myChainWrapper(mlflow.pyfunc.PythonModel):
    def __init__(self, chain=None):
        self.chain = chain
        
    def predict(self, context, model_input):
        """
        :param context: MLflow model context
        :param model_input: DataFrame or Series containing YouTube URLs
        :return: List of summaries
        """
        questions = model_input['input'].to_list()
            
        return [self.chain(question) for question in questions]


# create an experiment

In [0]:
model_name = "databricks-meta-llama-3-1-70b-instruct"

chain = myChain(model_name=model_name)

run_id, model_uri = log_chain_to_mlflow(chain=chain, 
                             wrapper=myChainWrapper, experiment_name="/Users/guillermo.angarita.gutierrez@gmail.com/agent_chain")

2024/11/21 10:32:30 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/11/21 10:32:30 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/11/21 10:32:30 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run flawless-mare-493 at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2642639785185002/runs/bb7d7bd184214e2b908dfb66637e3188
🧪 View experiment at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2642639785185002


In [0]:
run_id

'bb7d7bd184214e2b908dfb66637e3188'

In [0]:
model_uri

'runs:/bb7d7bd184214e2b908dfb66637e3188/artifacts'

# Monitoring model

## Helper functions

In [0]:
import psutil
import plotly.express as px
import plotly.graph_objects as go
import time
mlflow.enable_system_metrics_logging()

class PerformanceMonitor:
    def __init__(self):
        self.metrics_history = []
    
    def measure_latency(self, func, *args, **kwargs):
        """
        Measure execution time of a function
        
        :param func: Function to measure
        :param args: Positional arguments for the function
        :param kwargs: Keyword arguments for the function
        :return: tuple of (results, execution_time)
        """
        start_time = time.time()
        results = func(*args, **kwargs)  # Just store the results directly
        end_time = time.time()
        return results, end_time - start_time
    
    def measure_resource_usage(self):
        """Measure CPU and memory usage"""
        cpu_percent = psutil.cpu_percent()
        memory_info = psutil.Process().memory_info()
        return {
            'cpu_percent': cpu_percent,
            'memory_mb': memory_info.rss / 1024 / 1024
        }
    
    def calculate_text_metrics(self, question: str, response: str) -> dict:
        """
        Calculate text-based metrics like reduction percentage and lengths
        """
        question_length = len(question.split())
        response_length = len(response.split())
        
        return {
            'question_length': question_length,
            'response_length': response_length,
        }
    
    def log_performance(self, latency, question: str, response:str, resource_usage: dict):
        """
        Log all performance metrics including text metrics
        """
        text_metrics = self.calculate_text_metrics(question, response)
        metrics = {
            'latency': latency,
            **resource_usage,
            **text_metrics
        }
        self.metrics_history.append(metrics)
        self.log_metrics(metrics)
        return metrics
    
    def log_metrics(self, metrics_dict):
        """Log metrics to MLflow"""
        with mlflow.start_run():
            mlflow.log_metrics(metrics_dict)

In [0]:
def plot_metrics_over_time(metrics_history):
    """
    Create interactive plots for metrics over time including text metrics
    
    :param metrics_history: List of dictionaries containing metrics data
    """
    df = pd.DataFrame(metrics_history)
    
    # Latency plot
    fig_latency = px.line(df, y='latency', title='Inference Latency Over Time')
    fig_latency.show()
    
    # Resource usage plot
    fig_resources = go.Figure()
    fig_resources.add_trace(go.Scatter(y=df['cpu_percent'], name='CPU %'))
    fig_resources.add_trace(go.Scatter(y=df['memory_mb'], name='Memory (MB)'))
    fig_resources.update_layout(title='Resource Usage Over Time')
    fig_resources.show()
    
    # Text metrics plot
    fig_text = go.Figure()
    text_metrics = ['question_length', 'response_length']
    for metric in text_metrics:
        if metric in df.columns:
            fig_text.add_trace(go.Scatter(y=df[metric], name=metric))
    fig_text.update_layout(title='Text Metrics Over Time')
    fig_text.show()

## Inference model

In [0]:
def process_queries(input_df, ml_flow_chain, monitor):
    """
    Process multiple videos with monitoring
    
    :param video_ids: List of YouTube video IDs or URLs
    :return: tuple of (list of summaries, list of metrics)
    """
    # Process all videos
    responses = []
    metrics_list = []
    
    # Generate summaries with latency measurement
    results, total_latency = monitor.measure_latency(
        lambda: ml_flow_chain.predict(input_df)
    )
    
    # Calculate average latency per video
    avg_latency = total_latency / len(input_df)
    
    # Process each result
    for idx, result in enumerate(results):
        # Measure resource usage
        resource_usage = monitor.measure_resource_usage()

        # Log metrics for each video
        metrics = monitor.log_performance(avg_latency, input_df['input'].iloc[idx], result, resource_usage)
        
        responses.append(result)
        metrics_list.append(metrics)
    
    return responses, metrics_list

In [0]:
# Initialize the performance monitor
monitor = PerformanceMonitor()

ml_flow_chain = mlflow.pyfunc.load_model(model_uri)

questions = ["Qué es EPM en Colombia?",
              "What is Databricks?",
              "Que es langchain?"]

import pandas as pd
input_df = pd.DataFrame({'input': questions})

answers, metrics = process_queries(input_df, ml_flow_chain, monitor)
plot_metrics_over_time(monitor.metrics_history)

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer this question, I need to search for information about EPM in Colombia.

Action: search
Action Input: EPM Colombia[0m[33;1m[1;3mEPM informó una alianza con Klik Energy, startup que ofrece soluciones tecnológicas para la gestión eficiente de la energía, y esperan agregar hasta 13 gigavatios diariamente al Sistema Interconectado Nacional, SIN, con base en el ahorro de grandes consumidores. De acuerdo con ambas empresas, la idea es "contribuir al abastecimiento de energía en el país, mediante una solución que ... EPM dio a conocer los avances de Hidroituango y respondió las inquietudes que hay en torno a la construcción de la megaobra y a los costos asociados a la misma. ... Colombia ofrece apoyo para ... Lo invitamos a conocer los hechos noticiosos de Bogotá, Colombia y el mundo, a través del noticiero de CityNoticias de las 8 p. m. de este miércoles, 13 de noviembre. Bogotá 07:18 P.M. Empresas Públicas de Medellín 

2024/11/21 10:34:09 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2024/11/21 10:34:09 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/11/21 10:34:10 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run thoughtful-fox-577 at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2642639785185002/runs/a1ca87ae39d54cd0beb1c4ba28d93121
🧪 View experiment at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2642639785185002


2024/11/21 10:34:10 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


🏃 View run marvelous-sloth-935 at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2642639785185002/runs/051899d84a68495ebf366e7a73517856
🧪 View experiment at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2642639785185002


2024/11/21 10:34:10 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/11/21 10:34:10 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/11/21 10:34:10 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2024/11/21 10:34:10 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/11/21 10:34:10 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run mysterious-fish-987 at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2642639785185002/runs/a679500968c143b7bd594a9e7e3cdc9b
🧪 View experiment at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2642639785185002


# Evaluate Model

## Helper functions

In [0]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import nltk
from typing import List
import matplotlib.pyplot as plt
import numpy as np
# Download required NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def evaluate_model_with_mlflow(model, test_data: List[Dict[str, str]], experiment_name: str):
    """Evaluate model and log results to MLflow"""
    mlflow.set_experiment(experiment_name)
    
    with mlflow.start_run(run_name="model_evaluation") as run:
        all_metrics = []
        
        # Evaluate each test example
        for i, example in enumerate(test_data):
            try:
                # Create DataFrame input
                input_df = pd.DataFrame({'input': [example['question']]})
                
                # Generate prediction
                result = model.predict(input_df)[0]  # Get first result
                
                # Calculate metrics
                metrics = calculate_metrics(example['question'], result, example['answer'])
                all_metrics.append(metrics)
                
                # Log metrics for each example
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"example_{i}_{metric_name}", value)
                
                # Log summaries as artifacts
                example_dir = f"example_{i}"
                os.makedirs(example_dir, exist_ok=True)
                
                with open(f"{example_dir}/predicted_answer.txt", "w") as f:
                    f.write(result)
                with open(f"{example_dir}/reference_answer.txt", "w") as f:
                    f.write(example['answer'])
                
                mlflow.log_artifacts(example_dir)
                
            except Exception as e:
                print(f"Error processing example {i}: {str(e)}")
                continue
        
        if not all_metrics:
            print("No successful evaluations completed")
            return None, None
            
        # Calculate and log average metrics
        avg_metrics = {}
        for metric in all_metrics[0].keys():
            avg_value = np.mean([m[metric] for m in all_metrics])
            avg_metrics[f"avg_{metric}"] = avg_value
            mlflow.log_metric(f"avg_{metric}", avg_value)
        
        # Create and log visualizations
        create_and_log_visualizations(all_metrics)
        
        return run.info.run_id, avg_metrics

## Defining metrics

In [0]:
import re
@mlflow.trace(name="calculating metrics")
def calculate_metrics(question: str, answer: str, reference_answer: str) -> Dict[str, float]:
    """Calculate various evaluation metrics"""
    # ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference_answer, answer)
    
    # BLEU score
    reference = [reference_answer.split()]
    candidate = answer.split()
    bleu = sentence_bleu(reference, candidate)
    
    # Summary length metrics
    pred_length = len(answer.split())
    ref_length = len(reference_answer.split())
    length_ratio = pred_length / ref_length if ref_length > 0 else 0
    
    return {
        'rouge1_precision': rouge_scores['rouge1'].precision,
        'rouge1_recall': rouge_scores['rouge1'].recall,
        'rouge1_f1': rouge_scores['rouge1'].fmeasure,
        'rouge2_f1': rouge_scores['rouge2'].fmeasure,
        'rougeL_f1': rouge_scores['rougeL'].fmeasure,
        'bleu_score': bleu,
        'predicted/reference_length_ratio': length_ratio,
        'predicted_length': pred_length,
        'reference_length': ref_length,
    }

def create_and_log_visualizations(metrics_list: List[Dict[str, float]]):
    """Create and log visualizations to MLflow"""
    # Convert metrics to DataFrame
    df = pd.DataFrame(metrics_list)
    
    # ROUGE scores comparison
    plt.figure(figsize=(10, 6))
    rouge_metrics = ['rouge1_f1', 'rouge2_f1', 'rougeL_f1']
    df[rouge_metrics].mean().plot(kind='bar')
    plt.title('Average ROUGE Scores')
    plt.ylabel('Score')
    plt.tight_layout()
    plt.savefig('rouge_scores.png')
    mlflow.log_artifact('rouge_scores.png')
    plt.close()
    
    # Summary length analysis
    plt.figure(figsize=(10, 6))
    plt.scatter(df['reference_length'], df['predicted_length'])
    plt.plot([0, max(df['reference_length'])], [0, max(df['reference_length'])], '--', color='red')
    plt.xlabel('Reference response Length')
    plt.ylabel('Predicted response Length')
    plt.title('Response Length Comparison')
    plt.tight_layout()
    plt.savefig('length_comparison.png')
    mlflow.log_artifact('length_comparison.png')
    plt.close()
    
    # Metrics distribution
    plt.figure(figsize=(12, 6))
    metrics_to_plot = ['rouge1_f1', 'rouge2_f1', 'rougeL_f1', 'bleu_score']
    df[metrics_to_plot].boxplot()
    plt.title('Distribution of Evaluation Metrics')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('metrics_distribution.png')
    mlflow.log_artifact('metrics_distribution.png')
    plt.close()

## Prepare dataset for evalaution

In [0]:
def prepare_test_data() -> List[Dict[str, str]]:
    """Prepare test data with YouTube videos and reference summaries"""
    # Replace with your actual test data
    return [
        {
            "question": "que es epm en colombia",
            "answer": "Es una empresa colombiana de servicios públicos domiciliarios fundada el 6 de agosto de 1955 en Medellín. EPM ofrece servicios de energía eléctrica, gas natural, agua potable, saneamiento básico y telecomunicaciones. Es propiedad del municipio de Medellín y tiene presencia en varias regiones de Colombia y otros países de América Latina."
        },
        {
            "question": "why databricks can be written as dbx",
            "answer": """Databricks can be abbreviated as DBX because it is a shorthand or mnemonic derived from the company’s name, following a common convention in technology and branding:
	1.	“DB” for Databricks: The “DB” part directly represents “Data” and “Bricks,” reflecting the name of the platform.
	2.	“X” for Flexibility or Multiplicity:
	•	The “X” is often used in technology branding to imply scalability, versatility, or cutting-edge innovation.
	•	In some cases, “X” also represents a short, dynamic character to make abbreviations look modern and tech-savvy.
	3.	Compact Branding:
	•	Using DBX is easier to write, remember, and type than the full name “Databricks.”
	•	Many organizations and products adopt similar shortened names (e.g., GCP for Google Cloud Platform or AWS for Amazon Web Services).

Thus, DBX is a convenient, modern shorthand for Databricks, widely used in casual and technical communication."""
        },
        {
            "question": "what is an LLM Agent",
            "answer": "An LLM Agent is a software or application framework that uses a Large Language Model (LLM) as a core component to perform complex tasks by reasoning, generating content, or interacting with users or systems. LLM Agents typically extend the capabilities of an LLM to incorporate structured decision-making, external tool usage, and contextual adaptability."
        }
    ]

test_data = prepare_test_data()

## Experiments

### Experiment 1: Basic prompt

In [0]:
prompt_template = """
Answer the following question: {question}
"""
model_name = "databricks-meta-llama-3-1-70b-instruct"

chain = myChain(prompt_template=prompt_template, model_name=model_name)

run_id, model_uri = log_chain_to_mlflow(chain=chain, 
                             wrapper=myChainWrapper, experiment_name="/Users/guillermo.angarita.gutierrez@gmail.com/example-chain")

print(f"Chain logged with run_id: {run_id}")

# Load the chain
ml_flow_chain = mlflow.pyfunc.load_model(model_uri)

2024/11/21 10:35:25 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/11/21 10:35:26 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/11/21 10:35:26 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run delicate-slug-897 at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2444381359140680/runs/16a37ed1e9c74919bd193459f2f92a80
🧪 View experiment at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2444381359140680
Chain logged with run_id: 16a37ed1e9c74919bd193459f2f92a80


Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

In [0]:
# Prepare test data
test_data = prepare_test_data()
experiment_name = "/Users/guillermo.angarita.gutierrez@gmail.com/example-chain"
# Run evaluation
run_id, avg_metrics = evaluate_model_with_mlflow(ml_flow_chain, test_data, experiment_name)

print("\nEvaluation Results:")
print("==================")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")

print(f"\nMLflow run ID: {run_id}")
print("View detailed results in the MLflow UI")

2024/11/21 10:36:46 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2024/11/21 10:36:46 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/11/21 10:36:46 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Error processing example 0: 'str' object has no attribute 'input_variables'
Error processing example 1: 'str' object has no attribute 'input_variables'
Error processing example 2: 'str' object has no attribute 'input_variables'
No successful evaluations completed
🏃 View run model_evaluation at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2444381359140680/runs/ed6f3cf77f88427b8741e966f31a43eb
🧪 View experiment at: https://dbc-44a2cc4b-08f0.cloud.databricks.com/ml/experiments/2444381359140680

Evaluation Results:


[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-2642639785184990>, line 9[0m
[1;32m      7[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;130;01m\n[39;00m[38;5;124mEvaluation Results:[39m[38;5;124m"[39m)
[0;32m----> 9[0m [38;5;28;01mfor[39;00m metric, value [38;5;129;01min[39;00m avg_metrics[38;5;241m.[39mitems():
[1;32m     10[0m     [38;5;28mprint[39m([38;5;124mf[39m[38;5;124m"[39m[38;5;132;01m{[39;00mmetric[38;5;132;01m}[39;00m[38;5;124m: [39m[38;5;132;01m{[39;00mvalue[38;5;132;01m:[39;00m[38;5;124m.4f[39m[38;5;132;01m}[39;00m[38;5;124m"[39m)
[1;32m     12[0m [38;5;28mprint[39m([38;5;124mf[39m[38;5;124m"[39m[38;5;130;01m\n[39;00m[38;5;124mMLflow run ID: [39m[38;5;132;01m{[39;00mrun_id[38;5;132;01m}[39;00m[38;5;124m"[39m)

[0;31mAttributeError[0m: 'NoneType' object has no attribu

In [0]:
avg_metrics

{'avg_rouge1_precision': 0.39406062449871393,
 'avg_rouge1_recall': 0.4925566138426358,
 'avg_rouge1_f1': 0.40309388650149436,
 'avg_rouge2_f1': 0.1717194006790708,
 'avg_rougeL_f1': 0.2577484896838709,
 'avg_bleu_score': 0.02905379028174043,
 'avg_predicted/reference_length_ratio': 1.8229598341304758,
 'avg_predicted_length': 131.0,
 'avg_reference_length': 81.0}

# Register best model

In [0]:
# Log the final model to Databricks Model Registry
model_name = "llm_qa_chain"
model_version = mlflow.register_model(
    model_uri=model_uri,
    name=model_name  # Remove the "models:/" prefix
)

# Add description and tags
client = mlflow.tracking.MlflowClient()
client.update_registered_model(
    name=model_name,  # Use simple model name
    description="Question-Answering Chain using Llama 3 70B"
)

# Add version-specific tags
client.update_model_version(
    name=model_name,  # Use simple model name
    version=model_version.version,
    description="Basic QA chain with simple prompt template"
)

print(f"Model registered as: {model_name}")
print(f"Model version: {model_version.version}")

Registered model 'llm_qa_chain' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

Created version '1' of model 'llmops_class.default.llm_qa_chain'.


Model registered as: llm_qa_chain
Model version: 1
