In [1]:
pip install llama-cpp-python

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.9.tar.gz (67.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25ldone
[?25h  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.9-cp310-cp310-linux_x86_64.whl size=4065595 sha256=c8f876d795de60f4bbdb63763a84afea809b5e8a8d71aff6190489acf61a

In [2]:
pip install mlflow

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting sqlalchemy<3,>=1.4.0 (from mlflow)
  Downloading sqlalchemy-2.0.41-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.55.0-py3-none-any.whl.metadata (39 kB)
Collecting fas

In [14]:
# Standard Library Imports
import os
import logging
import warnings
from pathlib import Path

# Third-Party Libraries
import pandas as pd
from llama_cpp import Llama # Your core library

# MLflow for Experiment Tracking and Model Management
import mlflow
import mlflow.pyfunc
from mlflow import MlflowClient
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

In [15]:
# Silence Python warnings for the rest of the notebook session.
warnings.filterwarnings("ignore")


def configure_logger(name: str = "llama_cpp_logger", level: int = logging.INFO) -> logging.Logger:
    """Return the named logger with exactly one formatted stream handler.

    Re-running a notebook cell that calls ``addHandler`` with a fresh handler
    stacks handlers on the same logger object, so every message is emitted
    once per accumulated handler. Existing handlers are therefore removed
    before the new one is attached, which makes this function idempotent.

    Args:
        name: Name of the logger to configure.
        level: Minimum level the logger emits.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    configured = logging.getLogger(name)
    configured.setLevel(level)
    # Keep records out of the root logger so they are not printed a second time there.
    configured.propagate = False

    # Drop handlers left behind by an earlier execution of this cell.
    for stale_handler in list(configured.handlers):
        configured.removeHandler(stale_handler)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    configured.addHandler(stream_handler)
    return configured


# Notebook-wide logger used by every cell below.
logger = configure_logger()

In [40]:
# --- Llama CPP model location ---
# Local GGUF model file; a relative path, so it is resolved against the
# working directory the notebook runs in. The file must exist there.
LLAMA_MODEL_FILE_PATH = "../zephyr-quiklang-3b.Q4_K_M.gguf"
# Name the GGUF file is stored under inside the MLflow model's artifacts
# (the bare file name of the path above).
LLAMA_MODEL_ARTIFACT_NAME = Path(LLAMA_MODEL_FILE_PATH).name

# --- MLflow experiment, run, and registry names ---
EXPERIMENT_NAME_LLAMA = "LlamaCPP_TypingGame_Experiment"
RUN_NAME_LLAMA = "LlamaCPP_TypingGame_Run"
REGISTERED_MODEL_NAME_LLAMA = "LlamaCPP_TypingGame_Generator"

In [41]:
# Mark the start of the run in the notebook log.
logger.info("LlamaCPP script execution started.")

2025-05-30 19:08:52 - INFO - LlamaCPP script execution started.
2025-05-30 19:08:52 - INFO - LlamaCPP script execution started.


In [42]:
def log_asset_status_llama(asset_path: str, asset_name: str, failure_message: str) -> None:
    """Log whether a required asset exists on disk and raise if it does not.

    Args:
        asset_path: Filesystem path to check.
        asset_name: Human-readable label used in the log and error messages.
        failure_message: Extra guidance appended to the message when the
            asset is missing.

    Raises:
        FileNotFoundError: If nothing exists at ``asset_path``.
    """
    # Guard clause: report and abort as soon as the asset is known to be missing.
    if not Path(asset_path).exists():
        problem = f"{asset_name} not found at {asset_path}. {failure_message}"
        logger.error(problem)
        raise FileNotFoundError(problem)

    logger.info(f"{asset_name} found at {asset_path}.")

# Verify the GGUF model file is present; raises FileNotFoundError otherwise.
# Positional order: asset_path, asset_name, failure_message.
log_asset_status_llama(
    LLAMA_MODEL_FILE_PATH,
    "Llama GGUF Model",
    "Please ensure the GGUF model file is at the specified path.",
)

2025-05-30 19:08:53 - INFO - Llama GGUF Model found at ../zephyr-quiklang-3b.Q4_K_M.gguf.
2025-05-30 19:08:53 - INFO - Llama GGUF Model found at ../zephyr-quiklang-3b.Q4_K_M.gguf.


In [43]:
class LlamaTypingGameModel(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that generates text with a llama.cpp GGUF model.

    ``load_context`` builds a ``Llama`` instance from the GGUF file packaged
    as an MLflow artifact, ``predict`` generates one completion per row of the
    ``prompt`` column, and ``log_to_mlflow`` logs and registers the model.
    """

    # llama.cpp load options, kept as class attributes so a subclass can tune them.
    N_CTX = 16384
    N_THREADS = 8
    N_GPU_LAYERS = 35  # Set to 0 if deploying to CPU-only environment or if GPU causes issues

    # Generation defaults shared by predict() and the logged params schema.
    DEFAULT_MAX_TOKENS = 20
    DEFAULT_STOP_CONDITIONS = ("</s>",)

    def load_context(self, context):
        """
        Load the Llama GGUF model from the artifacts.

        :param context: MLflow context; ``context.artifacts`` is indexed with
                        ``LLAMA_MODEL_ARTIFACT_NAME`` to get the GGUF file path.
        """
        model_file_path_in_artifacts = context.artifacts[LLAMA_MODEL_ARTIFACT_NAME]
        logger.info(f"Loading Llama GGUF model from: {model_file_path_in_artifacts}")

        self.llm = Llama(
            model_path=model_file_path_in_artifacts,
            n_ctx=self.N_CTX,
            n_threads=self.N_THREADS,
            n_gpu_layers=self.N_GPU_LAYERS,
        )
        logger.info("Llama GGUF model loaded successfully.")

    def _generate_text(self, prompt: str, max_tokens: int = 20, stop_conditions: list | None = None, echo_results: bool = False):
        """
        Internal method to generate text using the loaded Llama model.

        :param prompt: Prompt text passed to the model.
        :param max_tokens: Passed through as the model call's ``max_tokens``.
        :param stop_conditions: Passed as the model call's ``stop`` argument;
                                defaults to ``["</s>"]`` when omitted.
        :param echo_results: Passed through as the model call's ``echo``.
        :return: The ``text`` field of the first choice in the model output.
        """
        # A list default in the signature would be one shared object across
        # all calls; build a fresh list per call instead.
        if stop_conditions is None:
            stop_conditions = list(self.DEFAULT_STOP_CONDITIONS)

        output = self.llm(
            prompt,
            max_tokens=max_tokens,
            stop=stop_conditions,
            echo=echo_results
        )
        return output["choices"][0]["text"]

    def predict(self, context, model_input, params=None):
        """
        Generate sentences based on input prompts.

        :param context: MLflow context (not used here).
        :param model_input: A pandas DataFrame with a 'prompt' column.
        :param params: Optional dictionary of runtime parameters. Only
                       ``max_tokens`` is read; it defaults to
                       ``DEFAULT_MAX_TOKENS`` when absent.
        :return: A pandas DataFrame with a 'generated_text' column, one row
                 per prompt. A prompt whose generation raises gets an
                 "Error: ..." string instead of generated text.
        """
        logger.info(f"Received model_input: {model_input}")
        prompts = model_input["prompt"]
        generated_texts = []

        # Extract parameters if provided, otherwise use defaults
        max_tokens = params.get("max_tokens", self.DEFAULT_MAX_TOKENS) if params else self.DEFAULT_MAX_TOKENS

        for p in prompts:
            try:
                text = self._generate_text(p, max_tokens=max_tokens)
                generated_texts.append(text)
            except Exception as e:
                # Best effort: one failing prompt must not discard the other results.
                logger.error(f"Error generating text for prompt '{p}': {e}")
                generated_texts.append(f"Error: Could not generate text. {e}")

        return pd.DataFrame({"generated_text": generated_texts})

    @classmethod
    def log_to_mlflow(cls, experiment_name: str, run_name: str, registered_model_name: str, gguf_model_local_path: str):
        """
        Logs the LlamaCPP model to MLflow, including registration.

        :param experiment_name: MLflow experiment to log under.
        :param run_name: Name given to the MLflow run.
        :param registered_model_name: Used both as the artifact sub-path of
                                      the logged model and as the registry name.
        :param gguf_model_local_path: Local path of the GGUF file to package.
        :return: The ``runs:/`` URI of the logged model.
        """
        # Imported here so the class does not depend on an edit to the notebook's import cell.
        from mlflow.types import ParamSchema, ParamSpec

        logger.info(f"Starting MLflow logging for experiment: {experiment_name}")
        mlflow.set_experiment(experiment_name=experiment_name)

        with mlflow.start_run(run_name=run_name) as run:
            logger.info(f"MLflow Run ID: {run.info.run_id}")
            logger.info(f"Run's Artifact URI: {run.info.artifact_uri}")

            # Define input and output schema
            input_schema = Schema([ColSpec("string", "prompt")])
            output_schema = Schema([ColSpec("string", "generated_text")])

            # Declare max_tokens in the signature: MLflow only forwards
            # inference-time params that the model signature defines.
            params_schema = ParamSchema([ParamSpec("max_tokens", "integer", cls.DEFAULT_MAX_TOKENS)])
            signature = ModelSignature(inputs=input_schema, outputs=output_schema, params=params_schema)

            # Define artifacts to be packaged with the model
            # The key is how it's accessed in load_context, value is the local path.
            artifacts_to_log = {
                LLAMA_MODEL_ARTIFACT_NAME: gguf_model_local_path
            }
            logger.info(f"Logging model with artifacts: {artifacts_to_log}")

            # Define pip requirements for the model's environment
            # This is crucial for HP AI Studio to build the correct serving environment.
            pip_requirements = [
                "pandas",
                "llama-cpp-python"  # Add specific version if necessary, e.g., "llama-cpp-python==0.2.20"
                # "mlflow" # MLflow itself is usually part of the serving environment already
            ]

            mlflow.pyfunc.log_model(
                artifact_path=registered_model_name,  # This becomes a sub-path in the run's artifact URI
                python_model=cls(),
                artifacts=artifacts_to_log,
                pip_requirements=pip_requirements,
                signature=signature,
                # code_path = [] # Optionally include other Python files if your class relies on them
                input_example=pd.DataFrame({"prompt": ["Write a sentence designed to test typing speed."]})
            )
            logger.info(f"Model '{registered_model_name}' logged to run {run.info.run_id}")

            # Register the logged model in MLflow Model Registry
            model_uri = f"runs:/{run.info.run_id}/{registered_model_name}"
            mlflow.register_model(
                model_uri=model_uri,
                name=registered_model_name
            )
            logger.info(f"Registered model '{registered_model_name}' with URI: {model_uri}")

        logger.info(f"MLflow logging and registration for '{registered_model_name}' complete.")
        return model_uri  # Return the model URI for convenience



In [None]:
    # Log the LlamaTypingGameModel to MLflow and register it in one call.
    # Positional order: experiment_name, run_name, registered_model_name, gguf_model_local_path.
    LlamaTypingGameModel.log_to_mlflow(
        EXPERIMENT_NAME_LLAMA,
        RUN_NAME_LLAMA,
        REGISTERED_MODEL_NAME_LLAMA,
        LLAMA_MODEL_FILE_PATH,
    )

2025-05-30 19:23:15 - INFO - Starting MLflow logging for experiment: LlamaCPP_TypingGame_Experiment
2025-05-30 19:23:15 - INFO - Starting MLflow logging for experiment: LlamaCPP_TypingGame_Experiment
2025-05-30 19:23:15 - INFO - MLflow Run ID: 45e649aa756c47c4b1014ec1f998e02b
2025-05-30 19:23:15 - INFO - MLflow Run ID: 45e649aa756c47c4b1014ec1f998e02b
2025-05-30 19:23:15 - INFO - Run's Artifact URI: /phoenix/mlflow/851378688990988137/45e649aa756c47c4b1014ec1f998e02b/artifacts
2025-05-30 19:23:15 - INFO - Run's Artifact URI: /phoenix/mlflow/851378688990988137/45e649aa756c47c4b1014ec1f998e02b/artifacts
2025-05-30 19:23:15 - INFO - Logging model with artifacts: {'zephyr-quiklang-3b.Q4_K_M.gguf': '../zephyr-quiklang-3b.Q4_K_M.gguf'}
2025-05-30 19:23:15 - INFO - Logging model with artifacts: {'zephyr-quiklang-3b.Q4_K_M.gguf': '../zephyr-quiklang-3b.Q4_K_M.gguf'}
2025-05-30 19:23:15 - INFO - Loading Llama GGUF model from: ../zephyr-quiklang-3b.Q4_K_M.gguf
2025-05-30 19:23:15 - INFO - Loading

In [None]:
    # Smoke test: load the newest registered version and run one prediction.
    logger.info(f"Fetching the latest version of model: {REGISTERED_MODEL_NAME_LLAMA}")
    client = MlflowClient()
    try:
        # search_model_versions replaces the deprecated, stage-based
        # get_latest_versions and returns every version of the model.
        model_metadata = client.search_model_versions(f"name='{REGISTERED_MODEL_NAME_LLAMA}'")

        if model_metadata:
            # Version numbers are strings; compare them numerically so "10" ranks above "9".
            latest_model_version = max(model_metadata, key=lambda mv: int(mv.version)).version
            logger.info(f"Latest Model Version: {latest_model_version}")

            model_uri_for_loading = f"models:/{REGISTERED_MODEL_NAME_LLAMA}/{latest_model_version}"
            logger.info(f"Loading model from URI: {model_uri_for_loading}")
            loaded_model = mlflow.pyfunc.load_model(model_uri=model_uri_for_loading)

            sample_query_df = pd.DataFrame({"prompt": ["Generate a short sentence."]})
            logger.info(f"Running prediction with sample query: {sample_query_df}")

            # Example of passing parameters at prediction time.
            # NOTE(review): MLflow applies `params` only when the logged model's
            # signature declares a params schema — confirm it includes max_tokens.
            prediction_params = {"max_tokens": 15}
            result = loaded_model.predict(sample_query_df, params=prediction_params)

            logger.info("Prediction Result:")
            logger.info(result.to_string())
        else:
            logger.error(f"Could not find any versions for model '{REGISTERED_MODEL_NAME_LLAMA}'.")

    except Exception as e:
        # Keep the notebook running, but record the traceback so the failure can be diagnosed.
        logger.exception(f"Error fetching or testing the model: {e}")

    logger.info("LlamaCPP script execution completed.")