In [1]:
# Install the NVML Python bindings (pynvml) that the import cell below pulls in.
!pip install pynvml



In [2]:
import torch
import time
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetUtilizationRates, nvmlShutdown

import threading

In [3]:
class GPUMonitor:
    """Sample GPU memory usage and utilization on a background thread via NVML.

    Call ``start()`` before the workload and ``stop()`` after it, then read the
    aggregated numbers with ``get_peak_usage()``.

    Args:
        monitoring_interval: Seconds to sleep between two samples.
        device_index: NVML index of the GPU to sample.
    """

    def __init__(self, monitoring_interval: float = 0.1, device_index: int = 0):
        self.monitoring_interval = monitoring_interval
        self.device_index = device_index
        self._gpu_memory_usage = []   # one entry per sample, in MB
        self._gpu_utilization = []    # one entry per sample, NVML `gpu` utilization value
        self._is_monitoring = False
        self._thread = None

    def _monitor(self):
        """Thread body: poll NVML until ``_is_monitoring`` is cleared."""
        nvmlInit()
        try:
            handle = nvmlDeviceGetHandleByIndex(self.device_index)
            while self._is_monitoring:
                mem_info = nvmlDeviceGetMemoryInfo(handle)
                util_info = nvmlDeviceGetUtilizationRates(handle)
                self._gpu_memory_usage.append(mem_info.used / (1024 ** 2))  # in MB
                self._gpu_utilization.append(util_info.gpu)
                time.sleep(self.monitoring_interval)
        finally:
            # Always release NVML, even when a query above raised.
            nvmlShutdown()

    def start(self):
        """Start sampling; does nothing if a sampler thread is already running."""
        if self._thread is not None and self._thread.is_alive():
            return
        self._is_monitoring = True
        # Daemon thread: a missed stop() must not keep the interpreter alive forever.
        self._thread = threading.Thread(target=self._monitor, daemon=True)
        self._thread.start()

    def stop(self):
        """Stop sampling and wait for the thread to exit; safe to call when never started."""
        self._is_monitoring = False
        if self._thread is not None:
            self._thread.join()
            self._thread = None

    def get_peak_usage(self):
        """Return the aggregated samples.

        Returns:
            dict with ``peak_gpu_memory_mb`` (maximum sampled memory, 0 when no
            samples exist) and ``p90_gpu_utilization`` (90th percentile of the
            sampled utilization, 0 when no samples exist).
        """
        memory_samples = list(self._gpu_memory_usage)
        utilization_samples = list(self._gpu_utilization)
        return {
            'peak_gpu_memory_mb': max(memory_samples, default=0),
            'p90_gpu_utilization': np.percentile(utilization_samples, 90) if utilization_samples else 0
        }

In [4]:
class _FirstTokenTimer:
    """Duck-typed ``generate`` streamer that timestamps the first generated token.

    ``model.generate(streamer=...)`` calls ``put`` once with the prompt ids,
    then once per decoding step, and ``end`` when generation finishes.
    """

    def __init__(self):
        self._puts = 0
        self.first_token_time = None  # perf_counter() value; None if no token was produced

    def put(self, value):
        self._puts += 1
        # Call 1 carries the prompt; call 2 carries the first new token.
        if self._puts == 2:
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            self.first_token_time = time.perf_counter()

    def end(self):
        """Part of the streamer protocol; nothing to flush here."""


def benchmark_language_model(model, tokenizer, prompts, temperature=0.7, max_new_tokens=100):
    """Benchmark ``model.generate`` on each prompt and report P90 statistics.

    Args:
        model: Causal LM exposing ``eval()``, ``cuda()`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; called with ``return_tensors="pt"``.
        prompts: Iterable of prompt strings; each is generated separately.
        temperature: Sampling temperature. A value of 0 (or ``None``) switches to
            greedy decoding, because sampling requires a strictly positive temperature.
        max_new_tokens: Upper bound on generated tokens per prompt.

    Returns:
        dict with ``p90_ttft_seconds`` (time until the first new token),
        ``p90_total_tps`` (prompt + generated tokens per second),
        ``p90_output_decode_tps`` (generated tokens per second over the whole
        call), ``max_gpu_memory_mb`` and ``p90_gpu_utilization``.

    Raises:
        ValueError: If ``prompts`` is empty.
    """
    prompts = list(prompts)
    if not prompts:
        raise ValueError("prompts must contain at least one prompt to benchmark")

    model.eval()
    model.cuda()

    generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "return_dict_in_generate": True,
        "output_scores": False,  # scores are not needed for timing
    }
    if temperature is not None and temperature > 0:
        generate_kwargs.update(do_sample=True, temperature=temperature)
    else:
        generate_kwargs["do_sample"] = False

    gpu_monitor = GPUMonitor()
    gpu_monitor.start()

    ttft_list = []
    tps_list = []
    decode_tps_list = []

    try:
        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
            input_len = inputs.input_ids.shape[-1]

            first_token_timer = _FirstTokenTimer()

            # Drain pending GPU work so it is not billed to this prompt.
            torch.cuda.synchronize()
            start_time = time.perf_counter()

            # Run generate; the streamer records the wall-clock time of the first token.
            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    streamer=first_token_timer,
                    **generate_kwargs,
                )

            torch.cuda.synchronize()
            gen_time = time.perf_counter() - start_time

            total_tokens = output.sequences.shape[-1]
            output_tokens = total_tokens - input_len

            if first_token_timer.first_token_time is not None:
                ttft = first_token_timer.first_token_time - start_time
            else:
                # No token was streamed; the full call time is the best available bound.
                ttft = gen_time

            ttft_list.append(ttft)
            tps_list.append(total_tokens / gen_time)
            decode_tps_list.append(output_tokens / gen_time)
    finally:
        # Never leave the sampler thread running when generation fails.
        gpu_monitor.stop()

    gpu_stats = gpu_monitor.get_peak_usage()

    return {
        'p90_ttft_seconds': np.percentile(ttft_list, 90),
        'p90_total_tps': np.percentile(tps_list, 90),
        'p90_output_decode_tps': np.percentile(decode_tps_list, 90),
        'max_gpu_memory_mb': gpu_stats['peak_gpu_memory_mb'],
        'p90_gpu_utilization': gpu_stats['p90_gpu_utilization'],
    }

In [5]:
# model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B").cuda()
# tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

# test_prompts = [
#     "Short prompt.",
#     "This is a medium length prompt with some context and words.",
#     "This is a very long prompt designed to simulate a large context. " * 10
# ]

# results = benchmark_language_model(
#     model=model,
#     tokenizer=tokenizer,
#     prompts=test_prompts,
#     temperature=0.7,
#     max_new_tokens=100
# )

# print(f"P90 TPS: {results['p90_total_tps']:.2f}")
# print(f"P90 TTFT: {results['p90_ttft_seconds']:.4f} seconds")
# print(f"P90 Output Decode TPS: {results['p90_output_decode_tps']:.2f}")
# print(f"Max GPU Memory: {results['max_gpu_memory_mb']:.2f} MB")
# print(f"P90 GPU Utilization: {results['p90_gpu_utilization']:.2f}%")

In [6]:
# import torch
# import gc
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import numpy as np

# def compare_llms(model_names, prompts, temperature=0.7, max_new_tokens=100):
#     all_results = {}

#     for model_name in model_names:
#         print(f"\n🔍 Benchmarking model: {model_name}")

#         try:
#             # Load model and tokenizer
#             model = AutoModelForCausalLM.from_pretrained(
#                 model_name,
#                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
#             ).cuda()
#             tokenizer = AutoTokenizer.from_pretrained(model_name)

#             # Setup tokenizer defaults if missing
#             if tokenizer.pad_token is None:
#                 tokenizer.pad_token = tokenizer.eos_token

#             # Benchmark
#             result = benchmark_language_model(
#                 model=model,
#                 tokenizer=tokenizer,
#                 prompts=prompts,
#                 temperature=temperature,
#                 max_new_tokens=max_new_tokens
#             )
#             all_results[model_name] = result

#         except Exception as e:
#             print(f"⚠️ Failed to benchmark {model_name}: {e}")
#             all_results[model_name] = {"error": str(e)}

#         # Clean up to prevent OOM
#         del model
#         del tokenizer
#         gc.collect()
#         torch.cuda.empty_cache()
#         torch.cuda.ipc_collect()

#     return all_results



# def print_comparison_table(results):
#     print("\n📊 Benchmark Comparison Table:")
#     headers = [
#         "Model",
#         "P90 TTFT (s)",
#         "P90 TPS",
#         "P90 Decode TPS",
#         "Max GPU Mem (MB)",
#         "P90 GPU Util (%)"
#     ]
#     print(f"{headers[0]:<45} {headers[1]:<15} {headers[2]:<10} {headers[3]:<15} {headers[4]:<18} {headers[5]}")
#     print("-" * 120)

#     for model_name, metrics in results.items():
#         print(f"{model_name:<45} "
#               f"{metrics['p90_ttft_seconds']:<15.4f} "
#               f"{metrics['p90_total_tps']:<10.2f} "
#               f"{metrics['p90_output_decode_tps']:<15.2f} "
#               f"{metrics['max_gpu_memory_mb']:<18.2f} "
#               f"{metrics['p90_gpu_utilization']:<.2f}")


# # ===========================
# # 🧪 Example Usage
# # ===========================
# model_names = [
#     "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
#     "tiiuae/falcon-rw-1b"
# ]

# test_prompts = [
#     "Short prompt.",
#     "This is a medium length prompt with some context and words.",
#     "This is a very long prompt designed to simulate a large context. " * 10
# ]

# results = compare_llms(
#     model_names=model_names,
#     prompts=test_prompts,
#     temperature=0.7,
#     max_new_tokens=100
# )

# print_comparison_table(results)


In [7]:
%%writefile compare_llms.py
import torch
import time
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetUtilizationRates, nvmlShutdown
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import threading

class GPUMonitor:
    """Sample GPU memory usage and utilization on a background thread via NVML.

    Call ``start()`` before the workload and ``stop()`` after it, then read the
    aggregated numbers with ``get_peak_usage()``.

    Args:
        monitoring_interval: Seconds to sleep between two samples.
        device_index: NVML index of the GPU to sample.
    """

    def __init__(self, monitoring_interval: float = 0.1, device_index: int = 0):
        self.monitoring_interval = monitoring_interval
        self.device_index = device_index
        self._gpu_memory_usage = []   # one entry per sample, in MB
        self._gpu_utilization = []    # one entry per sample, NVML `gpu` utilization value
        self._is_monitoring = False
        self._thread = None

    def _monitor(self):
        """Thread body: poll NVML until ``_is_monitoring`` is cleared."""
        nvmlInit()
        try:
            handle = nvmlDeviceGetHandleByIndex(self.device_index)
            while self._is_monitoring:
                mem_info = nvmlDeviceGetMemoryInfo(handle)
                util_info = nvmlDeviceGetUtilizationRates(handle)
                self._gpu_memory_usage.append(mem_info.used / (1024 ** 2))  # in MB
                self._gpu_utilization.append(util_info.gpu)
                time.sleep(self.monitoring_interval)
        finally:
            # Always release NVML, even when a query above raised.
            nvmlShutdown()

    def start(self):
        """Start sampling; does nothing if a sampler thread is already running."""
        if self._thread is not None and self._thread.is_alive():
            return
        self._is_monitoring = True
        # Daemon thread: a missed stop() must not keep the interpreter alive forever.
        self._thread = threading.Thread(target=self._monitor, daemon=True)
        self._thread.start()

    def stop(self):
        """Stop sampling and wait for the thread to exit; safe to call when never started."""
        self._is_monitoring = False
        if self._thread is not None:
            self._thread.join()
            self._thread = None

    def get_peak_usage(self):
        """Return the aggregated samples.

        Returns:
            dict with ``peak_gpu_memory_mb`` (maximum sampled memory, 0 when no
            samples exist) and ``p90_gpu_utilization`` (90th percentile of the
            sampled utilization, 0 when no samples exist).
        """
        memory_samples = list(self._gpu_memory_usage)
        utilization_samples = list(self._gpu_utilization)
        return {
            'peak_gpu_memory_mb': max(memory_samples, default=0),
            'p90_gpu_utilization': np.percentile(utilization_samples, 90) if utilization_samples else 0
        }

class _FirstTokenTimer:
    """Duck-typed ``generate`` streamer that timestamps the first generated token.

    ``model.generate(streamer=...)`` calls ``put`` once with the prompt ids,
    then once per decoding step, and ``end`` when generation finishes.
    """

    def __init__(self):
        self._puts = 0
        self.first_token_time = None  # perf_counter() value; None if no token was produced

    def put(self, value):
        self._puts += 1
        # Call 1 carries the prompt; call 2 carries the first new token.
        if self._puts == 2:
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            self.first_token_time = time.perf_counter()

    def end(self):
        """Part of the streamer protocol; nothing to flush here."""


def benchmark_language_model(model, tokenizer, prompts, temperature=0.7, max_new_tokens=100):
    """Benchmark ``model.generate`` on each prompt and report P90 statistics.

    Args:
        model: Causal LM exposing ``eval()``, ``cuda()`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; called with ``return_tensors="pt"``.
        prompts: Iterable of prompt strings; each is generated separately.
        temperature: Sampling temperature. A value of 0 (or ``None``) switches to
            greedy decoding, because sampling requires a strictly positive temperature.
        max_new_tokens: Upper bound on generated tokens per prompt.

    Returns:
        dict with ``p90_ttft_seconds`` (time until the first new token),
        ``p90_total_tps`` (prompt + generated tokens per second),
        ``p90_output_decode_tps`` (generated tokens per second over the whole
        call), ``max_gpu_memory_mb`` and ``p90_gpu_utilization``.

    Raises:
        ValueError: If ``prompts`` is empty.
    """
    prompts = list(prompts)
    if not prompts:
        raise ValueError("prompts must contain at least one prompt to benchmark")

    model.eval()
    model.cuda()

    generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "return_dict_in_generate": True,
        "output_scores": False,  # scores are not needed for timing
    }
    if temperature is not None and temperature > 0:
        generate_kwargs.update(do_sample=True, temperature=temperature)
    else:
        generate_kwargs["do_sample"] = False

    gpu_monitor = GPUMonitor()
    gpu_monitor.start()

    ttft_list = []
    tps_list = []
    decode_tps_list = []

    try:
        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
            input_len = inputs.input_ids.shape[-1]

            first_token_timer = _FirstTokenTimer()

            # Drain pending GPU work so it is not billed to this prompt.
            torch.cuda.synchronize()
            start_time = time.perf_counter()

            # Run generate; the streamer records the wall-clock time of the first token.
            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    streamer=first_token_timer,
                    **generate_kwargs,
                )

            torch.cuda.synchronize()
            gen_time = time.perf_counter() - start_time

            total_tokens = output.sequences.shape[-1]
            output_tokens = total_tokens - input_len

            if first_token_timer.first_token_time is not None:
                ttft = first_token_timer.first_token_time - start_time
            else:
                # No token was streamed; the full call time is the best available bound.
                ttft = gen_time

            ttft_list.append(ttft)
            tps_list.append(total_tokens / gen_time)
            decode_tps_list.append(output_tokens / gen_time)
    finally:
        # Never leave the sampler thread running when generation fails.
        gpu_monitor.stop()

    gpu_stats = gpu_monitor.get_peak_usage()

    return {
        'p90_ttft_seconds': np.percentile(ttft_list, 90),
        'p90_total_tps': np.percentile(tps_list, 90),
        'p90_output_decode_tps': np.percentile(decode_tps_list, 90),
        'max_gpu_memory_mb': gpu_stats['peak_gpu_memory_mb'],
        'p90_gpu_utilization': gpu_stats['p90_gpu_utilization'],
    }

def compare_llms(model_names, prompts, temperature=0.7, max_new_tokens=100):
    """Benchmark several models one after another on the same prompts.

    Each model is loaded, benchmarked with ``benchmark_language_model`` and then
    released before the next one is loaded.

    Args:
        model_names: Iterable of model identifiers passed to ``from_pretrained``.
        prompts: Prompts forwarded to ``benchmark_language_model``.
        temperature: Sampling temperature forwarded to the benchmark.
        max_new_tokens: Generation length limit forwarded to the benchmark.

    Returns:
        dict mapping each model name to its benchmark metrics, or to
        ``{"error": <message>}`` when loading or benchmarking that model failed.
    """
    all_results = {}

    for model_name in model_names:
        print(f"\n🔍 Benchmarking model: {model_name}")

        # Bind both names up front so the cleanup below is valid even when
        # loading fails before either assignment happens.
        model = None
        tokenizer = None

        try:
            # Load model and tokenizer
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
            ).cuda()
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Setup tokenizer defaults if missing
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            # Benchmark
            result = benchmark_language_model(
                model=model,
                tokenizer=tokenizer,
                prompts=prompts,
                temperature=temperature,
                max_new_tokens=max_new_tokens
            )
            all_results[model_name] = result

        except Exception as e:
            print(f"⚠️ Failed to benchmark {model_name}: {e}")
            all_results[model_name] = {"error": str(e)}

        finally:
            # Clean up to prevent OOM when the next model is loaded
            del model
            del tokenizer
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()

    return all_results

def print_comparison_table(results):
    """Print the benchmark results as a fixed-width text table.

    Args:
        results: Mapping of model name to either a metrics dict (keys
            ``p90_ttft_seconds``, ``p90_total_tps``, ``p90_output_decode_tps``,
            ``max_gpu_memory_mb``, ``p90_gpu_utilization``) or an
            ``{"error": <message>}`` dict as produced by ``compare_llms``.
    """
    print("\n📊 Benchmark Comparison Table:")
    headers = [
        "Model",
        "P90 TTFT (s)",
        "P90 TPS",
        "P90 Decode TPS",
        "Max GPU Mem (MB)",
        "P90 GPU Util (%)"
    ]
    print(f"{headers[0]:<45} {headers[1]:<15} {headers[2]:<10} {headers[3]:<15} {headers[4]:<18} {headers[5]}")
    print("-" * 120)

    for model_name, metrics in results.items():
        if "error" in metrics:
            # Failed models carry no metric keys; show the failure instead of raising KeyError.
            print(f"{model_name:<45} ERROR: {metrics['error']}")
            continue

        print(f"{model_name:<45} "
              f"{metrics['p90_ttft_seconds']:<15.4f} "
              f"{metrics['p90_total_tps']:<10.2f} "
              f"{metrics['p90_output_decode_tps']:<15.2f} "
              f"{metrics['max_gpu_memory_mb']:<18.2f} "
              f"{metrics['p90_gpu_utilization']:<.2f}")


Writing compare_llms.py


In [8]:
# Install pyngrok (tunnel) and streamlit (web UI) used by the app.py and launcher cells below.
!pip install pyngrok streamlit

Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading pyngrok-7.2.12-py3-none-any.whl (26 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.wh

In [9]:
%%writefile app.py
import streamlit as st
from compare_llms import compare_llms, print_comparison_table
import pandas as pd

# Streamlit front end: collect model names, prompts and settings, run the
# benchmark via compare_llms, and render the per-model metrics as a table.
st.set_page_config(page_title="LLM Benchmark Comparison", layout="wide")
st.title("🔍 Benchmark & Compare Language Models (LLMs)")
st.markdown("Benchmark LLMs like `DeepSeek`, `Mistral`, `Falcon`, etc. with custom prompts.")

# ===== Step 1: Model Selection =====
st.header("1. Choose Models to Benchmark")
default_models = [
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "tiiuae/falcon-rw-1b",
    "mistralai/Mistral-7B-Instruct-v0.2"
]
models_input = st.text_area("Enter model names (one per line)", "\n".join(default_models))
model_names = [m.strip() for m in models_input.splitlines() if m.strip()]

# ===== Step 2: Prompt Input =====
st.header("2. Enter Prompts (You can add multiple)")
prompt_container = st.container()
prompts = []

# Session state for dynamic prompt inputs
if "prompt_count" not in st.session_state:
    st.session_state.prompt_count = 1


def _add_prompt_field():
    """Button callback: request one more prompt input for the upcoming rerun."""
    st.session_state.prompt_count += 1


for i in range(st.session_state.prompt_count):
    prompt = prompt_container.text_input(f"Prompt {i+1}", key=f"prompt_{i}")
    if prompt:
        prompts.append(prompt)

# Button to add more prompt inputs. The increment happens in an on_click
# callback, which runs before the script reruns, so the new field is rendered
# immediately instead of only after the next interaction.
prompt_container.button("➕ Add another prompt", on_click=_add_prompt_field)

# ===== Step 3: Benchmark Settings =====
st.header("3. Benchmark Settings")
temperature = st.slider("Temperature", 0.0, 1.0, 0.7, 0.1)
max_new_tokens = st.slider("Max New Tokens", 10, 200, 100, 10)

# ===== Step 4: Run Benchmark =====
st.header("4. Run Benchmark")
if st.button("🚀 Run Benchmark"):
    if not model_names:
        st.warning("Please enter at least one model.")
    elif not prompts:
        st.warning("Please enter at least one prompt.")
    else:
        with st.spinner("Running benchmark... this may take several minutes..."):
            results = compare_llms(
                model_names=model_names,
                prompts=prompts,
                temperature=temperature,
                max_new_tokens=max_new_tokens
            )

        # Show results table
        st.success("✅ Benchmark complete!")
        st.subheader("📊 Results Table")

        rows = []
        failures = []  # (model name, error message) pairs shown below the table
        for model_name, metrics in results.items():
            if "error" in metrics:
                failures.append((model_name, metrics["error"]))
                rows.append({
                    "Model": model_name,
                    "P90 TTFT (s)": "ERROR",
                    "P90 TPS": "ERROR",
                    "P90 Decode TPS": "ERROR",
                    "Max GPU Mem (MB)": "ERROR",
                    "P90 GPU Util (%)": "ERROR"
                })
            else:
                rows.append({
                    "Model": model_name,
                    "P90 TTFT (s)": round(metrics['p90_ttft_seconds'], 4),
                    "P90 TPS": round(metrics['p90_total_tps'], 2),
                    "P90 Decode TPS": round(metrics['p90_output_decode_tps'], 2),
                    "Max GPU Mem (MB)": round(metrics['max_gpu_memory_mb'], 2),
                    "P90 GPU Util (%)": round(metrics['p90_gpu_utilization'], 2),
                })

        df = pd.DataFrame(rows)
        st.dataframe(df, use_container_width=True)

        # Surface the reason for each failure; the table only says "ERROR".
        for failed_model, message in failures:
            st.error(f"{failed_model}: {message}")

Writing app.py


In [10]:
from pyngrok import ngrok
import os
import threading

# Placeholder ngrok auth token, editable through the Colab form field below.
# Replace it with a real token before running.
NGROK_AUTH_TOKEN = "YOUR_AUTH_TOKEN" #@param {type:"string"}

# Register the token with the ngrok CLI (shell command run through IPython).
!ngrok authtoken $NGROK_AUTH_TOKEN

# Set streamlit port
port = 8501

# Stop any previously running ngrok process first
ngrok.kill()

# Open an ngrok tunnel to the Streamlit port and print the public URL
public_url = ngrok.connect(port)
print(f"🔗 Public URL: {public_url}")

# Run Streamlit in a separate thread so the cell does not block
def run_streamlit():
    """Launch the Streamlit app from app.py on `port` via a shell command."""
    os.system(f"streamlit run app.py --server.port {port}")

thread = threading.Thread(target=run_streamlit)
thread.start()


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
🔗 Public URL: NgrokTunnel: "https://be3e3eafaa2d.ngrok-free.app" -> "http://localhost:8501"
