# 01-01 : Self-Contained Llamafile Example

Tested on AWS SageMaker with an `ml.t3.xlarge` instance.

```bash
conda create -n llamafile python=3.10
```

In [1]:
# Model selection: which LLaVA build and which quantization level to fetch.
model = 'llava-v1.5-7b'
model_quantization = 'q4'

# File name of the llamafile, built from the model name and quantization.
model_file = '{}-{}.llamafile'.format(model, model_quantization)

# Direct-download URL for that file in the Mozilla organisation on Hugging Face.
model_url = 'https://huggingface.co/Mozilla/{}-llamafile/resolve/main/{}?download=true'.format(
    model, model_file
)
print(model_url)

https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/resolve/main/llava-v1.5-7b-q4.llamafile?download=true


## 0. Install Dependencies

In [None]:
!pip install tqdm requests openai==1.35.14

### 0.1. Download Llamafile

In [2]:
import os

import requests
from tqdm import tqdm

# Never embed an access token in the notebook. If the HF_TOKEN environment
# variable is set it is sent as a bearer token; otherwise the request is made
# without an Authorization header.
hf_token = os.environ.get("HF_TOKEN")
headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}

# Stream the llamafile at `model_url` to `model_file` in the working directory.
# The `with` block releases the connection even if the download fails midway;
# the timeout stops the cell from hanging forever on a stalled connection.
with requests.get(model_url, headers=headers, stream=True, timeout=60) as response:
    if response.status_code == 200:
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024 * 1024  # 1 MiB per chunk; 1 KiB chunks are needlessly slow for a multi-GB file

        # Skip the transfer when a file of exactly the advertised size is
        # already on disk, so re-running the notebook does not download it again.
        # A partial or differently sized file is overwritten.
        already_complete = (
            total_size > 0
            and os.path.exists(model_file)
            and os.path.getsize(model_file) == total_size
        )
        if already_complete:
            print(f"{model_file} is already downloaded; skipping.")
        else:
            with open(model_file, "wb") as file:
                # tqdm as a context manager closes the bar even on an exception.
                with tqdm(total=total_size, unit='iB', unit_scale=True) as progress_bar:
                    for data in response.iter_content(block_size):
                        file.write(data)
                        progress_bar.update(len(data))
    else:
        print(f"Failed to download file: {response.status_code}")

100%|██████████| 4.29G/4.29G [07:30<00:00, 9.52MiB/s]


## 1. Imports

In [3]:
import os
import socket
import stat
import subprocess
import time
from typing import Optional

from openai import OpenAI

## 2. Run Llamafile

In [4]:
# Launch the downloaded llamafile as a background HTTP server and wait until
# it accepts connections, so the following cells can be run immediately.

# Get the current working directory
cwd = os.getcwd()

# Construct the full path to the executable
executable_path = os.path.join(cwd, model_file)

# Ensure the file has execute permissions
if not os.access(executable_path, os.X_OK):
    st = os.stat(executable_path)
    os.chmod(executable_path, st.st_mode | stat.S_IEXEC)

# Define the arguments separately
server_port = 8081
arguments = ['--port', str(server_port), '--host', '0.0.0.0', '--nobrowser']

# Start the executable in the background
llamafile_process = subprocess.Popen(['bash', executable_path] + arguments)
print("Executable started in the background.")

# Popen returns before the server is ready. Poll the port until a TCP
# connection succeeds, failing fast if the process has already exited.
startup_timeout_s = 120
deadline = time.monotonic() + startup_timeout_s
while True:
    if llamafile_process.poll() is not None:
        raise RuntimeError(
            f"llamafile exited during startup with code {llamafile_process.returncode}"
        )
    try:
        with socket.create_connection(("127.0.0.1", server_port), timeout=1):
            break
    except OSError:
        if time.monotonic() >= deadline:
            # Inconclusive rather than fatal: the process is still alive, so
            # only warn and let the user decide whether to keep waiting.
            print(f"Warning: server not reachable on port {server_port} after {startup_timeout_s}s.")
            break
        time.sleep(0.5)

Executable started in the background.


{"build":1500,"commit":"a30b324","function":"server_cli","level":"INFO","line":2869,"msg":"build info","tid":"10733792","timestamp":1721292132}
{"function":"server_cli","level":"INFO","line":2872,"msg":"system info","n_threads":8,"n_threads_batch":-1,"system_info":"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | ","tid":"10733792","timestamp":1721292132,"total_threads":16}
{"function":"load_model","level":"INFO","line":435,"msg":"Multi Modal Mode Enabled","tid":"10733792","timestamp":1721292132}


note: if you have an AMD or NVIDIA GPU then you need to pass -ngl 9999 to enable GPU offloading
clip_model_load: model name:   openai/clip-vit-large-patch14-336
clip_model_load: description:  image encoder for LLaVA
clip_model_load: GGUF version: 3
clip_model_load: alignment:    32
clip_model_load: n_tensors:    377
clip_model_load: n_kv:         19
clip_model_load: ftype:        q4_0
clip_model_load: loaded meta data with 19 key-value pairs and 377 tensors from llava-v1.5-7b-mmproj-Q4_0.gguf
clip_model_load: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
clip_model_load: - kv   0:                       general.architecture str              = clip
clip_model_load: - kv   1:                      clip.has_text_encoder bool             = false
clip_model_load: - kv   2:                    clip.has_vision_encoder bool             = true
clip_model_load: - kv   3:                   clip.has_llava_projector bool             = true
clip_model_load: - kv   4:    

{"function":"initialize","level":"INFO","line":489,"msg":"initializing slots","n_slots":1,"tid":"10733792","timestamp":1721292133}
{"function":"initialize","level":"INFO","line":498,"msg":"new slot","n_ctx_slot":2048,"slot_id":0,"tid":"10733792","timestamp":1721292133}
{"function":"server_cli","level":"INFO","line":3090,"msg":"model loaded","tid":"10733792","timestamp":1721292133}
In the sandboxing block!
{"function":"server_cli","hostname":"0.0.0.0","level":"INFO","line":3213,"msg":"HTTP server listening","port":"8081","tid":"10733792","timestamp":1721292133}
{"function":"update_slots","level":"INFO","line":1659,"msg":"all slots are idle and system prompt is empty, clear the KV cache","tid":"10733792","timestamp":1721292133}


## 3. Create LLM Client

In [5]:
# OpenAI-compatible client aimed at the local server on port 8081.
# api_key is set to a placeholder string.
client = OpenAI(api_key="sk-no-key-required", base_url="http://localhost:8081/v1")

## 4. Test LLM

In [6]:
# Conversation to send: a system instruction followed by a single user question.
chat_messages = [
    dict(role="system", content="You are an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."),
    dict(role="user", content="Why is the sky blue"),
]

# Request a chat completion and show the message of the first choice.
completion = client.chat.completions.create(model="LLaMA_CPP", messages=chat_messages)
print(completion.choices[0].message)

{"function":"launch_slot_with_data","level":"INFO","line":884,"msg":"slot is processing task","slot_id":0,"task_id":0,"tid":"10733792","timestamp":1721292142}
{"function":"update_slots","level":"INFO","line":1910,"msg":"kv cache rm [p0, end)","p0":0,"slot_id":0,"task_id":0,"tid":"10733792","timestamp":1721292142}
{"function":"print_timings","level":"INFO","line":313,"msg":"prompt eval time     =     873.78 ms /    73 tokens (   11.97 ms per token,    83.54 tokens per second)","n_tokens_second":83.54457904404401,"num_prompt_tokens_processed":73,"slot_id":0,"t_prompt_processing":873.785,"t_token":11.969657534246576,"task_id":0,"tid":"10733792","timestamp":1721292156}
{"function":"print_timings","level":"INFO","line":327,"msg":"generation eval time =   12334.03 ms /   111 runs   (  111.12 ms per token,     9.00 tokens per second)","n_decoded":111,"n_tokens_second":8.999491650336507,"slot_id":0,"t_token":111.1173873873874,"t_token_generation":12334.03,"task_id":0,"tid":"10733792","timestam

{"function":"log_server_request","level":"INFO","line":2794,"method":"GET","msg":"request","params":{},"path":"/","remote_addr":"127.0.0.1","remote_port":59018,"status":200,"tid":"129650367519744","timestamp":1721292187}
{"function":"log_server_request","level":"INFO","line":2794,"method":"GET","msg":"request","params":{},"path":"/completion.js","remote_addr":"127.0.0.1","remote_port":59022,"status":200,"tid":"129650367520096","timestamp":1721292187}{"function":"log_server_request","level":"INFO","line":2794,"method":"GET","msg":"request","params":{},"path":"/index.js","remote_addr":"127.0.0.1","remote_port":59018,"status":200,"tid":"129650367519744","timestamp":1721292187}

{"function":"log_server_request","level":"INFO","line":2794,"method":"GET","msg":"request","params":{},"path":"/json-schema-to-grammar.mjs","remote_addr":"127.0.0.1","remote_port":59038,"status":200,"tid":"129650367520448","timestamp":1721292187}
{"function":"log_server_request","level":"INFO","line":2794,"method":

## 5. Stop Llamafile

In [7]:
# Ask the server process to stop, then wait for it so the child is reaped
# instead of lingering as a zombie.
llamafile_process.terminate()
try:
    llamafile_process.wait(timeout=10)
except subprocess.TimeoutExpired:
    # Still running after the grace period: force-kill it and reap.
    llamafile_process.kill()
    llamafile_process.wait()