## Testing the speed difference between inference backends (Hugging Face Transformers vs vLLM vs llama.cpp)

In [None]:
!apt update

In [None]:
!pip install --upgrade transformers safetensors sentencepiece huggingface-hub protobuf accelerate bitsandbytes tqdm openai backoff retrying ipykernel ipywidgets matplotlib cmake scikit-build-core setuptools

In [None]:
!pip install "numpy<2.3"

In [None]:
!pip install vllm

In [None]:
!pip install llama-cpp-python \
    --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121 \
    --force-reinstall

In [None]:
#!pip install --no-cache-dir --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu128

In [None]:
!apt install git-lfs

In [None]:
import os

# Process-wide settings, applied in one go:
#   - turn tokenizer parallelism off,
#   - keep OpenMP and MKL on a single thread each (reduces thread contention),
#   - make git skip downloading LFS payloads on clone/checkout.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "GIT_LFS_SKIP_SMUDGE": "1",
    }
)

In [None]:
!git lfs install

In [None]:
!git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct LLM_MODELS/Qwen2.5-7B-Instruct

In [None]:
!git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF

In [None]:
!cd LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF && git lfs pull --include "qwen2.5-7b-instruct-q4_k_m*.gguf"

In [None]:
# Fetch the llama.cpp command-line tools (release b7134) and expose
# llama-gguf-split to later `!` commands through PATH.

# 1. Download the official pre-built binaries (Ubuntu x64)
print("Downloading llama.cpp CLI tools...")
!wget -q https://github.com/ggml-org/llama.cpp/releases/download/b7134/llama-b7134-bin-ubuntu-x64.zip -O llama_tools.zip

# 2. Unzip into a specific folder (-o overwrites files from a previous run)
print("Extracting...")
!unzip -o -q llama_tools.zip -d llama_tools_bin

# 3. Make the splitter executable
# Absolute path, so the PATH entry added below does not depend on the working directory.
tool_path = os.path.abspath("llama_tools_bin/build/bin/llama-gguf-split")
!chmod +x {tool_path}

# 4. Add to PATH environment variable so you can use '!' commands
# The substring test keeps re-running this cell from appending the same directory twice.
if "llama_tools_bin/build/bin" not in os.environ["PATH"]:
    os.environ["PATH"] += os.pathsep + os.path.dirname(tool_path)

print(f"Installed tools to: {os.path.dirname(tool_path)}")

# 5. Verify it works
# Prints the first lines of the tool's --help output as a smoke test.
print("\nVerifying llama-gguf-split version:")
!llama-gguf-split --help | head -n 5

In [None]:
!llama-gguf-split --merge LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q4_k_m.gguf

In [None]:
from misc.speed_test_backend import BenchmarkConfig, run_benchmark

# One BenchmarkConfig keyword set per backend under comparison.
benchmark_settings = [
    {
        "model_path": "LLM_MODELS/Qwen2.5-7B-Instruct",
        "backend": "hf",
        "quantize_4bit": True,
    },
    {
        "model_path": "LLM_MODELS/Qwen2.5-7B-Instruct",
        "backend": "vllm",
        "quantization": "fp8",
    },
    {
        "model_path": "LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q4_k_m.gguf",
        "backend": "llamacpp",
    },
]

# Keep whatever run_benchmark returns for every backend, keyed by backend name,
# so the runs can be compared afterwards instead of each run overwriting the
# previous result. `config` and `results` still end up holding the last
# (llamacpp) run.
results_by_backend = {}
for settings in benchmark_settings:
    config = BenchmarkConfig(**settings)
    results = run_benchmark(config)
    results_by_backend[settings["backend"]] = results

### On specific dataset and prompt

In [None]:
# Run the dataset experiments on the "hf" backend with the local checkpoint below.
snapshot_path = "LLM_MODELS/Qwen2.5-7B-Instruct"
# Alternative checkpoint:
#snapshot_path = "/workspace/LLM_MODELS/Llama-SEA-LION-v3-8B-IT"

os.environ.update(
    {
        "LLM_BACKEND": "hf",
        "LOCAL_MODEL_PATH": snapshot_path,
        "LLM_MODEL": snapshot_path,
        # "1" enables 4-bit loading (bitsandbytes must be set up); "0" disables quantization.
        "LLM_LOAD_IN_4BIT": "1",
    }
)
print("LOCAL_MODEL_PATH =", os.environ["LOCAL_MODEL_PATH"])
print("LLM_MODEL =", os.environ["LLM_MODEL"])

# Mirror the values as Python variables in case the kernel does not recognize
# the environment variables when `$LLM_MODEL` is used in `!` commands.
LLM_MODEL = LOCAL_MODEL_PATH = snapshot_path


In [None]:
# Universal command-line arguments.
# MAX_NEW_TOKENS only limits how many tokens are generated, while
# max_position_embeddings (from the LLM's config.json) is the context length.
# !!! input_length + MAX_NEW_TOKENS should be < context_length, otherwise the LLM breaks.
# Llama 3 only has an 8k context length / max_position_embeddings.
# SEALIONv3-Llama3-8B-IT uses RoPE, so its max_position_embeddings follows the RoPE limit (131k);
# Qwen2.5-7B-IT has a 32k context length; SahabatAIv1-Llama3-8B-IT has an 8k context length.
# Response sizes were counted from the notebook output cells with an online token counter:
# translation ~400 tokens, decomposition ~500 tokens, search_resolve ~700 tokens.
# Change these for every process (translate, decompose, search_resolve); each one
# needs a different value. Time is in seconds.
LLM_WORKER_MAX_TIME = 300
MAX_NEW_TOKENS = 1200
BATCH_NUM = 1

# Export the same values as strings for the scripts launched from this notebook.
os.environ.update(
    {
        "LLM_WORKER_MAX_TIME": str(LLM_WORKER_MAX_TIME),
        "MAX_NEW_TOKENS": str(MAX_NEW_TOKENS),
        "BATCH_NUM": str(BATCH_NUM),
    }
)

print("LLM_WORKER_MAX_TIME =", os.environ["LLM_WORKER_MAX_TIME"])
print("MAX_NEW_TOKENS =", os.environ["MAX_NEW_TOKENS"])
print("BATCH_NUM =", os.environ["BATCH_NUM"])

In [None]:
# Naive prompting only needs a True/False answer based on the context,
# so a smaller generation budget is enough.
MAX_NEW_TOKENS = 600
BATCH_NUM = 1
os.environ.update(
    {
        "MAX_NEW_TOKENS": str(MAX_NEW_TOKENS),
        "BATCH_NUM": str(BATCH_NUM),
    }
)

In [None]:
# Solving with naive prompting with explanations only
!python naive_prompting.py --data_path results_bahasa_translation --dataset_name ProntoQA --sample_pct 1 --prompts_folder manual_prompts_translated --prompts_file naive_prompting_explanations_before_answer --split dev --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
!python evaluate.py --dataset_name ProntoQA --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --evaluation_method naive_prompting

In [None]:
# Switch to the llamacpp backend, using the merged Q4_K_M GGUF file as the model.
snapshot_path = "LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q4_k_m.gguf"
# Alternative checkpoint:
#snapshot_path = "/workspace/LLM_MODELS_EXL2/Llama-SEA-LION-v3-8B-IT-EXL2-Indonesia-Focus"

# NOTE(review): LLM_LOAD_IN_4BIT is not touched here, so a value set by an
# earlier cell stays in effect - confirm whether the llamacpp backend reads it.
os.environ.update(
    {
        "LLM_BACKEND": "llamacpp",
        "LOCAL_MODEL_PATH": snapshot_path,
        "LLM_MODEL": snapshot_path,
    }
)
print("LOCAL_MODEL_PATH =", os.environ["LOCAL_MODEL_PATH"])
print("LLM_MODEL =", os.environ["LLM_MODEL"])

# Mirror the values as Python variables in case the kernel does not recognize
# the environment variables when `$LLM_MODEL` is used in `!` commands.
LLM_MODEL = LOCAL_MODEL_PATH = snapshot_path

In [None]:
# Solving with naive prompting with explanations with llamacpp backend
!python naive_prompting.py --data_path results_bahasa_translation --dataset_name ProntoQA --sample_pct 1 --prompts_folder manual_prompts_translated --prompts_file naive_prompting_explanations_before_answer --split dev --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
!python evaluate.py --dataset_name ProntoQA --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --evaluation_method naive_prompting

In [None]:
# Switch to the vllm backend, using the local Hugging Face checkpoint as the model.
snapshot_path = "LLM_MODELS/Qwen2.5-7B-Instruct"
# Alternative checkpoint:
#snapshot_path = "/workspace/LLM_MODELS/Llama-SEA-LION-v3-8B-IT"

# NOTE(review): LLM_LOAD_IN_4BIT is not touched here, so a value set by an
# earlier cell stays in effect - confirm whether the vllm backend reads it.
os.environ.update(
    {
        "LLM_BACKEND": "vllm",
        "LOCAL_MODEL_PATH": snapshot_path,
        "LLM_MODEL": snapshot_path,
    }
)

print("LOCAL_MODEL_PATH =", os.environ["LOCAL_MODEL_PATH"])
print("LLM_MODEL =", os.environ["LLM_MODEL"])

# Mirror the values as Python variables in case the kernel does not recognize
# the environment variables when `$LLM_MODEL` is used in `!` commands.
LLM_MODEL = LOCAL_MODEL_PATH = snapshot_path

In [None]:
# Solving with naive prompting with explanations with vllm backend
!python naive_prompting.py --data_path results_bahasa_translation --dataset_name ProntoQA --sample_pct 1 --prompts_folder manual_prompts_translated --prompts_file naive_prompting_explanations_before_answer --split dev --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
!python evaluate.py --dataset_name ProntoQA --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --evaluation_method naive_prompting