In [None]:
# If you re-run this notebook several times, this cell deletes the previously
# cloned repo so that stale code is not picked up.
%cd /content
!rm -rf 23CLCT2_TraditionalMedicineChatbot

In [None]:
# Clone the correct branch (only `main`, via --single-branch), then enter the repo directory.
!git clone -b main --single-branch https://github.com/HuyTran28/23CLCT2_TraditionalMedicineChatbot.git
%cd 23CLCT2_TraditionalMedicineChatbot

# Show the checked-out branch name and the latest commit so you can confirm which code is in use.
!git rev-parse --abbrev-ref HEAD
!git log -1 --oneline

In [None]:
# Cài dependencies
!python -m pip install -q -r requirements.txt
!python -m pip install -q pyngrok

# Khuyến nghị cho chạy model trên GPU (device_map=auto) và 4bit (LOAD_IN_4BIT=1)
!python -m pip install -q -U accelerate bitsandbytes

## Cấu hình model (tuỳ chọn)
Script server đọc các biến môi trường:
- `HF_MODEL` hoặc `MODEL_ID` (mặc định: `Qwen/Qwen2.5-14B-Instruct`)
- `LOAD_IN_4BIT=1` để bật 4bit nếu môi trường hỗ trợ
- `HF_DEVICE_MAP=auto` (khuyên dùng)

Nếu bạn đặt biến `LLM_API_KEY`, client trên máy local phải gửi header `Authorization: Bearer <key>`.

In [None]:
import os

# Environment defaults for the server script. `setdefault` only fills in a
# variable that is not set yet, so values exported earlier in the session win.
for var_name, default_value in (
    ("HF_MODEL", "Qwen/Qwen2.5-14B-Instruct"),  # model
    ("LOAD_IN_4BIT", "1"),                      # 4-bit (set to '0' to turn it off)
    ("HF_DEVICE_MAP", "auto"),
):
    os.environ.setdefault(var_name, default_value)

# API protection (optional). To run without auth, leave the line below commented out.
# os.environ["LLM_API_KEY"] = "my-secret-token"

# Echo the effective settings; for the API key only report whether it is set.
for var_name in ("HF_MODEL", "LOAD_IN_4BIT", "HF_DEVICE_MAP"):
    print(f"{var_name}=", os.environ.get(var_name))
print("LLM_API_KEY set?", bool(os.environ.get("LLM_API_KEY")))

## Chạy server nền
Chạy nền để bạn vẫn có thể mở ngrok và test trong các cell kế tiếp.

Nếu server crash (thường do thiếu VRAM hoặc lỗi tải model), cell sẽ in log để bạn xem.

In [None]:
import os, subprocess, sys, time, requests

# Optional: avoid noisy progress bars that can bloat logs
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")

# Optional: if you previously hit torchvision/torch mismatch in Colab, uncomment:
!pip -q uninstall -y torchvision

# IMPORTANT: don't pipe stdout to avoid deadlock when logs get large.
# Write logs to a file instead.
log_path = "server.log"
log_fp = open(log_path, "w")

print("Starting server...")
server_proc = subprocess.Popen(
    [sys.executable, "-u", "code/chatbot/scripts/colab_llm_server.py"],
    stdout=log_fp,
    stderr=subprocess.STDOUT,
    text=True,
)

# Model load can take several minutes. While loading, /health may refuse connections.
deadline = time.time() + 900  # 15 minutes
last_err = None

print("Waiting for /health (this can take 5–15 minutes on first run)...")
while time.time() < deadline:
    if server_proc.poll() is not None:
        print("Server exited with code:", server_proc.returncode)
        break
    try:
        r = requests.get("http://127.0.0.1:8000/health", timeout=2)
        if r.status_code == 200:
            print("READY:", r.status_code, r.text)
            break
    except Exception as e:
        last_err = e
        print(".", end="", flush=True)
        time.sleep(5)
else:
    print("\nTimeout waiting for /health. Last error:", last_err)

print("\n\n=== server.log (tail) ===")
log_fp.flush()
try:
    with open(log_path, "r") as f:
        tail = f.readlines()[-80:]
    print("".join(tail))
except Exception as e:
    print("Could not read server.log:", e)

In [None]:
# Check the server locally before opening ngrok.
import requests

health_response = requests.get("http://127.0.0.1:8000/health", timeout=20)
for label, value in (
    ("status:", health_response.status_code),
    ("body:", health_response.text),
):
    print(label, value)

## Mở ngrok
- Lấy authtoken tại https://dashboard.ngrok.com/get-started/your-authtoken
- Notebook sẽ ép IPv4 (`127.0.0.1:8000`) để tránh lỗi `dial tcp [::1]:8000 ... connection refused`.

In [None]:
from pyngrok import ngrok
import getpass
from google.colab import userdata

# Read the ngrok authtoken from the Colab secret named "NGROK_TOKEN".
# Any failure while reading the secret leaves the token as None, and the
# check below turns that into one clear error message.
NGROK_TOKEN = None
try:
    NGROK_TOKEN = userdata.get("NGROK_TOKEN")
except Exception:
    pass

if not NGROK_TOKEN:
    raise RuntimeError("NGROK_TOKEN not found. Check that the Secret name is exactly 'NGROK_TOKEN' and that you have granted access.")

ngrok.set_auth_token(NGROK_TOKEN)

# Clean up any old tunnel (best effort: a failure here is ignored on purpose).
try:
    ngrok.kill()
except Exception:
    pass

# Tunnel to the explicit IPv4 address rather than "localhost".
public_url = ngrok.connect(proto="http", addr="127.0.0.1:8000")
print(f"LLM_API_BASE = {public_url.public_url}")

In [None]:
# Test through the public URL (don't call .json() right away while you are debugging).
import os, requests

base = public_url.public_url

# Send the bearer token only when LLM_API_KEY is set in this session.
api_key = os.getenv('LLM_API_KEY')
headers = {'Authorization': f"Bearer {api_key}"} if api_key else {}

health_resp = requests.get(f"{base}/health", headers=headers, timeout=30)
print('health status:', health_resp.status_code)
print('health body:', health_resp.text[:500])

# Keep this request tiny so it's fast even on CPU/offload.
payload = dict(
    prompt='Trả lời đúng 1 ký tự: 2+2=?',
    max_new_tokens=4,
    temperature=0.0,
)
complete_resp = requests.post(f"{base}/v1/complete", json=payload, headers=headers, timeout=300)
print('complete status:', complete_resp.status_code)
print('complete body:', complete_resp.text[:500])

## Dùng URL này ở máy local (Windows PowerShell)
Sau khi có `LLM_API_BASE`, bên Windows bạn set biến môi trường như sau:

```powershell
$env:LLM_API_BASE="<paste LLM_API_BASE from colab>"
# Nếu bạn bật auth
$env:LLM_API_KEY="<same token as colab>"
```
Sau đó chạy query/webapp trên máy local.