## Testing the speed difference between ExLlamaV2 vs Orthodox Transformer

In [None]:
# Refresh the apt package index before installing system packages (git-lfs is installed further down).
!apt update

In [None]:
# Install or upgrade the Python packages this notebook relies on: the Hugging Face stack
# (transformers, accelerate, safetensors, huggingface-hub), bitsandbytes, exllamav2, and the
# build helpers (cmake, scikit-build-core, setuptools) along with general utilities.
!pip install --upgrade transformers safetensors sentencepiece huggingface-hub protobuf accelerate bitsandbytes tqdm openai backoff retrying ipykernel ipywidgets matplotlib exllamav2 cmake scikit-build-core setuptools

In [None]:
# Install vLLM, one of the three backends compared in the benchmark cell below.
!pip install vllm

In [None]:
# Build llama-cpp-python from source with the CUDA backend enabled.
# --no-cache-dir stops pip from reusing a previously cached (possibly CPU-only) wheel, which
# would otherwise make the CMAKE_ARGS setting have no effect on the installed package.
!CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir

In [None]:
#!pip install --no-cache-dir --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu128

In [None]:
# Install Git LFS, needed to fetch the large weight files when cloning model repositories.
# -y answers the confirmation prompt automatically; a notebook cell has no interactive stdin,
# so without it apt aborts whenever it needs to ask before installing.
!apt install -y git-lfs

In [None]:
# Register the Git LFS hooks for the current user so that later `git clone` calls download LFS-tracked files.
!git lfs install

In [None]:
%%bash
# Clone llama.cpp into tools/ and build it with CUDA enabled. The build provides the
# llama-gguf-split tool that merges the sharded GGUF download.
#
# Stop at the first failing command: without this, a failed clone or `cd` would fall
# through and the remaining commands would run in the wrong directory.
set -euo pipefail

# Prepare a clean directory for tools
mkdir -p tools
cd tools

# Remove old repo if it exists to avoid conflicts
rm -rf llama.cpp

# Clone and Compile
echo "Cloning llama.cpp..."
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp

echo "Compiling split tool..."
cmake -B build -DGGML_CUDA=ON
# Build in parallel on every available core; the binaries end up in build/bin/.
cmake --build build --config Release -j "$(nproc)"

In [None]:
# Clone the Qwen2.5-7B-Instruct repository from the Hugging Face Hub into LLM_MODELS/.
# This directory is the model path used by the "hf" and "vllm" benchmark entries below.
!git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct LLM_MODELS/Qwen2.5-7B-Instruct

In [None]:
# Download only the Q4_K_M GGUF shard(s) of Qwen2.5-7B-Instruct.
# `git clone` cannot filter files by pattern (it has no --include option), so the
# Hugging Face Hub client is used instead; `allow_patterns` restricts the download
# to the matching files and `local_dir` places them where the later cells expect them.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
    allow_patterns=["qwen2.5-7b-instruct-q4_k_m*.gguf"],
    local_dir="LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF",
)

In [None]:
# Merge the two Q4_K_M shards into the single GGUF file that the llama.cpp backend loads.
# The split tool is called through its build location because the llama.cpp build above is
# never installed onto PATH, and both file paths are given relative to the notebook's working
# directory so the merged file lands where the later cells look for it.
!tools/llama.cpp/build/bin/llama-gguf-split --merge LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q4_k_m.gguf

In [None]:
import subprocess
import sys
import torch

# Sequential speed benchmark: run the external benchmark script once per backend,
# each time in a fresh Python process.

# Configuration
MODEL_PATH = "LLM_MODELS/Qwen2.5-7B-Instruct"
GGUF_PATH = "LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q4_k_m.gguf"
SCRIPT_PATH = "misc/speed_test_backend.py"

benchmarks = [
    # (backend, model path, extra CLI flags as one space-separated string; "" for none)
    ("hf", MODEL_PATH, "--quantize_4bit"),
    ("llamacpp", GGUF_PATH, ""),
    ("vllm", MODEL_PATH, ""),
]

# Asking for the device name raises when no CUDA device is visible, which would abort
# the cell before a single benchmark had been attempted.
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
else:
    device_name = "CPU (no CUDA device detected)"
print(f"Starting Sequential Benchmark on {device_name}...")

failed_backends = []

for backend, path, flags in benchmarks:
    print(f"\n{'='*40}")
    print(f"Running: {backend.upper()}")
    print(f"{'='*40}")

    # Construct command
    cmd = [
        sys.executable, SCRIPT_PATH,
        "--backend", backend,
        "--model_path", path,
        "--max_new_tokens", "300",
    ]
    # Split so that an entry carrying several flags becomes several argv items
    # instead of one argument containing spaces; an empty string adds nothing.
    cmd.extend(flags.split())

    # Run as a separate process so the GPU memory is released when it exits.
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        # Keep going: one failing backend should not prevent the others from running.
        print(f"Benchmark {backend} failed with error: {e}")
        failed_backends.append(backend)

print("\nAll benchmarks complete.")
if failed_backends:
    print(f"Backends that failed: {', '.join(failed_backends)}")

### On specific dataset and prompt

In [None]:
import os

# Model used for the first run. LLM_BACKEND is not set in this cell.
# Alternative checkpoint: "/workspace/LLM_MODELS/Llama-SEA-LION-v3-8B-IT"
snapshot_path = "LLM_MODELS/Qwen2.5-7B-Instruct"

# Export the settings through the environment so that the scripts launched with
# `!python ...` inherit them.
os.environ.update(
    {
        # Turn off tokenizer parallelism and keep OpenMP/MKL single-threaded
        # to reduce thread contention.
        "TOKENIZERS_PARALLELISM": "false",
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        # Both variables point at the same model location.
        "LOCAL_MODEL_PATH": snapshot_path,
        "LLM_MODEL": snapshot_path,
        # "1" enables 4-bit quantized loading (needs bitsandbytes), "0" disables it.
        "LLM_LOAD_IN_4BIT": "1",
    }
)
print("LOCAL_MODEL_PATH =", os.environ["LOCAL_MODEL_PATH"])
print("LLM_MODEL =", os.environ["LLM_MODEL"])

# Mirror the values as notebook variables in case the kernel does not pick up
# the environment variables when expanding `$LLM_MODEL` in `!` commands.
LLM_MODEL = LOCAL_MODEL_PATH = snapshot_path


In [None]:
# Command-line settings shared by every step.
#
# MAX_NEW_TOKENS only caps how many tokens are generated, whereas max_position_embeddings
# (from the model's config.json) is the context length. input_length + MAX_NEW_TOKENS
# should stay below the context length, otherwise the LLM breaks.
#   - Llama 3: 8k context length / max_position_embeddings
#   - SEALIONv3-Llama3-8B-IT: uses RoPE, max_position_embeddings follows the RoPE limit (131k)
#   - Qwen2.5-7B-IT: 32k context length
#   - SahabatAIv1-Llama3-8B-IT: 8k context length
# Response sizes measured from the notebook output cells with an online token counter:
# translation ~400 tokens, decomposition ~500 tokens, search_resolve ~700 tokens.
# Adjust these values for each process (translate, decompose, search_resolve); each one
# needs a different value. LLM_WORKER_MAX_TIME is in seconds.
LLM_WORKER_MAX_TIME = 300
MAX_NEW_TOKENS = 1200
BATCH_NUM = 1

# Publish the same values as environment variables (as strings).
os.environ.update(
    LLM_WORKER_MAX_TIME=str(LLM_WORKER_MAX_TIME),
    MAX_NEW_TOKENS=str(MAX_NEW_TOKENS),
    BATCH_NUM=str(BATCH_NUM),
)

print("LLM_WORKER_MAX_TIME =", os.environ["LLM_WORKER_MAX_TIME"])
print("MAX_NEW_TOKENS =", os.environ["MAX_NEW_TOKENS"])
print("BATCH_NUM =", os.environ["BATCH_NUM"])

In [None]:
# Naive prompting only needs a True/False answer based on the context, so the
# generation budget is lowered for these runs.
MAX_NEW_TOKENS = 600
BATCH_NUM = 1
os.environ["MAX_NEW_TOKENS"] = str(MAX_NEW_TOKENS)
os.environ["BATCH_NUM"] = str(BATCH_NUM)

In [None]:
# Run naive prompting (explanations before the answer) on the ProntoQA dev split with the
# model configured above. IPython substitutes $LLM_MODEL, $BATCH_NUM and $MAX_NEW_TOKENS
# with the notebook variables before the command is handed to the shell.
!python naive_prompting.py --data_path results_bahasa_translation --dataset_name ProntoQA --sample_pct 0 --prompts_folder manual_prompts_translated --prompts_file naive_prompting_explanations_before_answer --split dev --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
# Evaluate the ProntoQA results under the same --save_path and --model_name as the run above
# (presumably scoring the outputs that run wrote; confirm against evaluate.py).
!python evaluate.py --dataset_name ProntoQA --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --evaluation_method naive_prompting

In [None]:
# Switch the backend to llama.cpp and point the model variables at the merged Q4_K_M GGUF file.
os.environ["LLM_BACKEND"] = "llamacpp"

snapshot_path = "LLM_MODELS_GGUF/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q4_k_m.gguf"
#snapshot_path = "/workspace/LLM_MODELS_EXL2/Llama-SEA-LION-v3-8B-IT-EXL2-Indonesia-Focus"

os.environ["LOCAL_MODEL_PATH"] = snapshot_path
os.environ["LLM_MODEL"] = snapshot_path

# LLM_LOAD_IN_4BIT is not changed here and keeps whatever an earlier cell set;
# the GGUF file itself is already quantized (q4_k_m).
print("LOCAL_MODEL_PATH =", os.environ["LOCAL_MODEL_PATH"])
print("LLM_MODEL =", os.environ["LLM_MODEL"])

# Mirror the values as notebook variables in case the kernel does not pick up
# the environment variables when expanding `$LLM_MODEL` in `!` commands.
LLM_MODEL = snapshot_path
LOCAL_MODEL_PATH = snapshot_path

In [None]:
# Run naive prompting (explanations before the answer) on the ProntoQA dev split with the
# llama.cpp backend. $LLM_MODEL now expands to the GGUF file path set in the previous cell.
!python naive_prompting.py --data_path results_bahasa_translation --dataset_name ProntoQA --sample_pct 0 --prompts_folder manual_prompts_translated --prompts_file naive_prompting_explanations_before_answer --split dev --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
# Evaluate the ProntoQA results for the llama.cpp run (same --save_path, with --model_name
# set to the GGUF path; presumably scoring that run's outputs, confirm against evaluate.py).
!python evaluate.py --dataset_name ProntoQA --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --evaluation_method naive_prompting

In [None]:
# Switch the backend to vLLM and point the model variables back at the Hugging Face checkpoint.
os.environ["LLM_BACKEND"] = "vllm"

snapshot_path = "LLM_MODELS/Qwen2.5-7B-Instruct"
#snapshot_path = "/workspace/LLM_MODELS/Llama-SEA-LION-v3-8B-IT"

os.environ["LOCAL_MODEL_PATH"] = snapshot_path
os.environ["LLM_MODEL"] = snapshot_path

print("LOCAL_MODEL_PATH =", os.environ["LOCAL_MODEL_PATH"])
print("LLM_MODEL =", os.environ["LLM_MODEL"])

# NOTE(review): this is the same model path as the first (non-vLLM) run, and the run cells
# all use the same --save_path; check that the vLLM results do not overwrite the earlier ones.

# Mirror the values as notebook variables in case the kernel does not pick up
# the environment variables when expanding `$LLM_MODEL` in `!` commands.
LLM_MODEL = snapshot_path
LOCAL_MODEL_PATH = snapshot_path

In [None]:
# Run naive prompting (explanations before the answer) on the ProntoQA dev split with the
# vLLM backend. $LLM_MODEL expands to the Hugging Face checkpoint path set in the previous cell.
!python naive_prompting.py --data_path results_bahasa_translation --dataset_name ProntoQA --sample_pct 0 --prompts_folder manual_prompts_translated --prompts_file naive_prompting_explanations_before_answer --split dev --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
# Evaluate the ProntoQA results for the vLLM run (same --save_path and --model_name as the
# run above; presumably scoring that run's outputs, confirm against evaluate.py).
!python evaluate.py --dataset_name ProntoQA --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --evaluation_method naive_prompting