# Llama 2 Fastapi Service Example

建議至少使用 T4 GPU 來執行此服務。

## Step 0: Config Setting

In [1]:
# For long-running tests on Google Colab, ngrok can be used to proxy the local server.
# Register an account at https://ngrok.com/ and request an auth token.
# The token is read from the NGROK_TOKEN environment variable so the secret is
# not saved inside the notebook; it stays None when the variable is unset.
import os

NGROK_TOKEN = os.environ.get("NGROK_TOKEN")

In [2]:
# Model weights to serve: Hugging Face repo id and the file name inside that repo.
# Model files can be found on Hugging Face, for example:
# Llama: https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML
# Taiwan Llama: https://huggingface.co/audreyt/Taiwan-LLaMa-v1.0-GGML
# NOTE(review): the variable names and the example links say GGML, but the values
# configured below point at a GGUF repo/file — confirm which format the installed
# llama-cpp-python expects before swapping in one of the GGML examples.
GGML_HUGGINGFACE_REPO = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
GGML_HUGGINGFACE_BIN_FILE = "mistral-7b-instruct-v0.1.Q5_0.gguf"

## Step 1: Install python package

In [3]:
# 安裝 fastapi, nest-asyncio, pyngrok, uvicorn, accelerate 和 transformers 套件，以支援API開發和深度學習模型的操作。
!pip install fastapi nest-asyncio pyngrok uvicorn accelerate transformers

Collecting fastapi
  Downloading fastapi-0.103.2-py3-none-any.whl (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyngrok
  Downloading pyngrok-7.0.0.tar.gz (718 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.7/718.7 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting uvicorn
  Downloading uvicorn-0.23.2-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading trans

In [4]:
# Install llama-cpp-python pinned to 0.2.6, with CUDA cuBLAS support requested through CMAKE_ARGS / FORCE_CMAKE.
# `--force-reinstall` reinstalls even when the package is already present, `--no-cache-dir` bypasses
# pip's local cache, and `--verbose` prints the detailed build output.
# NOTE(review): the version is pinned with `==0.2.6`, so `--upgrade` cannot pull a newer release — confirm whether the flag is still needed.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.2.6 --force-reinstall --upgrade --no-cache-dir --verbose

Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting llama-cpp-python==0.2.6
  Downloading llama_cpp_python-0.2.6.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Running command pip subprocess to install build dependencies
  Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
  Collecting scikit-build-core[pyproject]>=0.5.0
    Downloading scikit_build_core-0.5.1-py3-none-any.whl (130 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 130.7/130.7 kB 3.3 MB/s eta 0:00:00
  Collecting exceptiongroup (from scikit-build-core[pyproject]>=0.5.0)
    Downloading exceptiongroup-1.1.3-py3-none-any.whl (14 kB)
  Collecting packaging>=20.9 (from scikit-build-core[pyproject]>=0.5.0)
    Downloading packaging-23.2-py3-none-any.whl (53 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.0/53.0 kB 4.9 MB/s eta 0:00:00
  Collecting tomli>=1

In [5]:
# 安裝 huggingface_hub 套件，此套件可支援與 Hugging Face Model Hub 進行交互。
!pip install huggingface_hub



## Step 2: Download GGML Model and predict the result

In [6]:
import json
import logging
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Logging configuration: INFO level, timestamped records with the level name in brackets.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
# Logger named after the current module.
logger = logging.getLogger(__name__)

class Model:
    """Lazy wrapper around a llama.cpp chat model fetched from the Hugging Face Hub.

    ``load`` only downloads the model file and records its path; the ``Llama``
    object is constructed on the first ``predict`` call.
    """

    def __init__(self, n_threads=2, n_batch=1024, n_gpu_layers=32):
        """Create an empty wrapper.

        Args:
            n_threads: Forwarded to ``Llama`` as ``n_threads``.
            n_batch: Forwarded to ``Llama`` as ``n_batch``.
            n_gpu_layers: Forwarded to ``Llama`` as ``n_gpu_layers``.
        """
        self.loaded = False        # True once the Llama object has been constructed
        self.lcpp_llm = None       # Holds the Llama instance after the first predict()
        self.model_path = ""       # Local path of the downloaded model file
        self.n_threads = n_threads
        self.n_batch = n_batch
        self.n_gpu_layers = n_gpu_layers

    def load(self, model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML", model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin"):
        """Download the model file from the Hugging Face Hub and remember its path.

        Args:
            model_name_or_path: Hub repository id passed to ``hf_hub_download`` as ``repo_id``.
            model_basename: File name inside that repository.
        """
        self.model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
        # Drop any previously constructed Llama so the next predict() uses the
        # file selected here instead of silently keeping the old one.
        self.lcpp_llm = None
        self.loaded = False
        logger.info("Finish: Load Llama 2 model.")

    def predict(self, data):
        """Run a chat completion and return the model's response.

        Args:
            data: Mapping of keyword arguments forwarded unchanged to
                ``Llama.create_chat_completion`` (the notebook passes a
                dict with a ``"messages"`` list).

        Returns:
            Whatever ``create_chat_completion`` returns.

        Raises:
            RuntimeError: If no model file has been downloaded yet.
        """
        if not self.loaded:
            if not self.model_path:
                raise RuntimeError("Model file is not available; call load() before predict().")
            self.lcpp_llm = Llama(
                model_path=self.model_path,
                n_threads=self.n_threads,
                n_batch=self.n_batch,
                n_gpu_layers=self.n_gpu_layers,
            )
            # Mark as loaded only after construction succeeded, so a failed
            # attempt can be retried instead of leaving lcpp_llm as None.
            self.loaded = True
        logger.info("========== Start ==========")
        response = self.lcpp_llm.create_chat_completion(**data)
        logger.info("==========  End  ==========")

        return response

In [7]:
# Create the shared model wrapper and download the configured model file.
model_instance = Model()
model_instance.load(
    model_name_or_path=GGML_HUGGINGFACE_REPO,
    model_basename=GGML_HUGGINGFACE_BIN_FILE,
)

Downloading (…)truct-v0.1.Q5_0.gguf:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [8]:
# Sample chat request: one system instruction followed by one user question.
data = dict(
    messages=[
        dict(role="system", content="你是一個有幫助的問答機器人，請用繁體中文回覆。"),
        dict(role="user", content="台灣首都在哪裏？"),
    ],
)

In [9]:
# Run the sample request through the model; the returned response is shown as the cell output.
model_instance.predict(data)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


{'id': 'chatcmpl-b82e3519-f0d6-4c56-80ab-9c3d52ee8886',
 'object': 'chat.completion',
 'created': 1697093944,
 'model': '/root/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.1-GGUF/snapshots/45167a542b6fa64a14aea61a4c468bbbf9f258a8/mistral-7b-instruct-v0.1.Q5_0.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant', 'content': '台灣首都在台北。'},
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 45, 'completion_tokens': 8, 'total_tokens': 53}}

## Step 3: Build the fastapi service

In [10]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# Create the FastAPI application.
app = FastAPI()

# Add the CORS middleware so the API can be called from other origins.
# Credentials are disabled: this service uses no cookies or HTTP auth, and
# combining a wildcard origin with credentialed requests would let any site
# make authenticated cross-origin calls.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],             # accept cross-origin requests from any origin
    allow_credentials=False,         # no cookies / HTTP auth on cross-origin requests
    allow_methods=['*'],             # accept every HTTP method
    allow_headers=['*'],             # accept every request header
)

@app.post("/predict")
async def predict_text(json_input: dict):
    """POST /predict: forward the JSON body to the model and return its response.

    Args:
        json_input: Request body as a dict; it is passed unchanged to
            ``model_instance.predict`` and must contain a ``"messages"`` key.

    Returns:
        The value returned by ``model_instance.predict``.

    Raises:
        HTTPException: 422 when the body has no ``"messages"`` key.
    """
    from fastapi import HTTPException  # local import: only this handler needs it

    # Reject malformed bodies up front so the client gets a clear 422 instead
    # of a 500 raised from inside the chat-completion call.
    if "messages" not in json_input:
        raise HTTPException(
            status_code=422,
            detail="Request body must contain a 'messages' list.",
        )
    # NOTE(review): predict() is a synchronous call inside an async handler, so
    # it appears to block the event loop while generating — confirm this is intended.
    return model_instance.predict(json_input)


## Step 4: Start the fastapi service

In [None]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn

# Port shared by the ngrok tunnel and the uvicorn server; the two must agree.
SERVICE_PORT = 8000

# Register the ngrok auth token only when one has been configured.
if NGROK_TOKEN is not None:
    ngrok.set_auth_token(NGROK_TOKEN)

# Open an ngrok tunnel so the local port is reachable from outside.
ngrok_tunnel = ngrok.connect(SERVICE_PORT)
public_url = ngrok_tunnel.public_url

print('Public URL:', public_url)
print("You can use {}/predict to get the assistant result.".format(public_url))

# nest_asyncio patches asyncio so uvicorn can run inside the notebook's event loop.
nest_asyncio.apply()

try:
    # Serve the FastAPI app; this call returns only when the server stops.
    uvicorn.run(app, port=SERVICE_PORT)
finally:
    # Close the tunnel so re-running this cell does not accumulate open tunnels.
    ngrok.disconnect(public_url)




INFO:     Started server process [646]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://8dd9-104-199-121-204.ngrok-free.app
You can use https://8dd9-104-199-121-204.ngrok-free.app/predict to get the assistant result.


Llama.generate: prefix-match hit


INFO:     35.229.163.27:0 - "POST /predict HTTP/1.1" 200 OK


### Example CURL command line (replace the URL with the Public URL printed by the cell above):

```bash
curl --location 'https://f1b8-35-184-42-82.ngrok-free.app/predict' \
--header 'Content-Type: application/json' \
--data '{"messages": [{"role": "user", "content": "test"}], "max_tokens": 2}'
```