In [8]:
!pip install -q fastapi uvicorn pyngrok nest-asyncio

# Install llama-cpp-python with CUDA support (pre-built wheel)
!pip install -q llama-cpp-python[server] starlette-context --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124

# Install other dependencies
!pip install -q "datasets>=2.18.0" "python-constraint>=1.4.0" "pandas>=2.0.0" "tqdm>=4.66.0"

!nvidia-smi

Sat Feb  7 07:30:56 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             13W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [None]:
from huggingface_hub import hf_hub_download

print("Downloading model (or using cached)...")
MODEL_PATH = hf_hub_download(
    repo_id="unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF",
    filename="Qwen3-Coder-30B-A3B-Instruct-Q6_K.gguf",
)
print(f"Model path: {MODEL_PATH}")


print("Downloading model (or using cached)...")
MODEL_PATH = hf_hub_download(
    repo_id="unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
    filename="Qwen3-30B-A3B-Thinking-2507-Q6_K.gguf",
)
print(f"Model path: {MODEL_PATH}")

print("Downloading model (or using cached)...")
MODEL_PATH = hf_hub_download(
    repo_id="unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF",
    filename="Qwen3-30B-A3B-Instruct-2507-Q6_K.gguf",
)
print(f"Model path: {MODEL_PATH}")




Downloading model (or using cached)...
Model path: /root/.cache/huggingface/hub/models--mradermacher--Qwen3-Coder-30B-A3B-Instruct-i1-GGUF/snapshots/4234cd8b2fbbf28c29249f2174b5f3edf19e90e8/Qwen3-Coder-30B-A3B-Instruct.i1-IQ4_XS.gguf


In [None]:
import subprocess
import time
import threading
from pyngrok import ngrok

PORT = 8000
NGROK_TOKEN = ""  # Get from: https://dashboard.ngrok.com/get-started/your-authtoken

# Model settings
N_GPU_LAYERS = -1  # -1 = all layers on GPU

# =============================================================================
# Start llama-cpp-python built-in server
# =============================================================================
def start_llama_server():
    """Start the llama.cpp server as a subprocess."""
    cmd = [
        "python", "-m", "llama_cpp.server",
        "--model", MODEL_PATH,
        "--host", "0.0.0.0",
        "--port", str(PORT),
        "--n_gpu_layers", str(N_GPU_LAYERS),
        "--n_ctx", "32768",   
        "--n_batch", "2048",           
        "--flash_attn", "true",
        "--offload_kqv", "true",   # Offload KQV to GPU
        "--use_mlock", "false",    # Don't lock model in RAM
        "--use_mmap", "false", 
    ]
    
    print("Starting llama.cpp server...")
    print(f"Command: {' '.join(cmd)}")
    
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    )
    
    # Stream output
    for line in process.stdout:
        print(line, end="")
    
    return process

# =============================================================================
# Setup ngrok tunnel
# =============================================================================
print("Setting up ngrok tunnel...")
ngrok.set_auth_token(NGROK_TOKEN)
public_url = ngrok.connect(PORT)

print("\n" + "=" * 60)
print("üöÄ SERVER READY!")
print("=" * 60)
print(f"üì° Public URL: {public_url}")
print(f"üîó API Base:   {public_url}/v1")
print(f"üí¨ Chat:       {public_url}/v1/chat/completions")
print(f"üìã Models:     {public_url}/v1/models")
print(f"üìñ Docs:       {public_url}/docs")
print("=" * 60)


# =============================================================================
# Start server in background thread
# =============================================================================
server_thread = threading.Thread(target=start_llama_server, daemon=True)
server_thread.start()

# Wait for server to start
print("\n‚è≥ Waiting for server to initialize...")
time.sleep(10)

print("\n‚úÖ Server is running! Use the URL above to connect.")
print("Press Ctrl+C or interrupt kernel to stop.\n")

# Keep running
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("\nüõë Stopping server...")
    ngrok.disconnect(public_url)
    print("Done.")


Setting up ngrok tunnel...

üöÄ SERVER READY!
üì° Public URL: NgrokTunnel: "https://curtate-unkeeled-sam.ngrok-free.dev" -> "http://localhost:8000"
üîó API Base:   NgrokTunnel: "https://curtate-unkeeled-sam.ngrok-free.dev" -> "http://localhost:8000"/v1
üí¨ Chat:       NgrokTunnel: "https://curtate-unkeeled-sam.ngrok-free.dev" -> "http://localhost:8000"/v1/chat/completions
üìã Models:     NgrokTunnel: "https://curtate-unkeeled-sam.ngrok-free.dev" -> "http://localhost:8000"/v1/models
üìñ Docs:       NgrokTunnel: "https://curtate-unkeeled-sam.ngrok-free.dev" -> "http://localhost:8000"/docs
Starting llama.cpp server...
Command: python -m llama_cpp.server --model /root/.cache/huggingface/hub/models--mradermacher--Qwen3-Coder-30B-A3B-Instruct-i1-GGUF/snapshots/4234cd8b2fbbf28c29249f2174b5f3edf19e90e8/Qwen3-Coder-30B-A3B-Instruct.i1-IQ4_XS.gguf --host 0.0.0.0 --port 8000 --n_gpu_layers -1 --n_ctx 16384 --n_batch 16384 --chat_format chatml

‚è≥ Waiting for server to initialize...
ggml_cu

t=2026-02-07T07:33:48+0000 lvl=warn msg="failed to open private leg" id=a8490f5af304 privaddr=localhost:8000 err="dial tcp [::1]:8000: connect: connection refused"
t=2026-02-07T07:33:49+0000 lvl=warn msg="failed to open private leg" id=5c3b004c4980 privaddr=localhost:8000 err="dial tcp [::1]:8000: connect: connection refused"
t=2026-02-07T07:33:50+0000 lvl=warn msg="failed to open private leg" id=c37ede91c2d7 privaddr=localhost:8000 err="dial tcp [::1]:8000: connect: connection refused"


load_tensors: offloading 48 repeating layers to GPU
load_tensors: offloading output layer to GPU
load_tensors: offloaded 49/49 layers to GPU
load_tensors:        CUDA0 model buffer size =  7918.39 MiB
load_tensors:        CUDA1 model buffer size =  7528.35 MiB
load_tensors:   CPU_Mapped model buffer size =   157.65 MiB
Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).
.
llama_context: constructing llama_context
llama_context: n_seq_max     = 1
llama_context: n_ctx         = 16384
llama_context: n_ctx_per_seq = 16384
llama_context: n_batch       = 16384
llama_context: n_ubatch      = 512
llama_context: causal_attn   = 1
llama_context: flash_attn    = 0
llama_context: kv_unified    = false
llama_context: freq_base     = 10000000.0
llama_context: freq_scale    = 1
llama_context: n_ctx_per_seq (16384) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
set_abort_callback: call
llama_context:  CUDA_Host  output buffer size =     0.58 MiB
create_memory: n_ctx = 1