In [1]:
# Install dependencies
!pip install fastapi uvicorn pyngrok llama-cpp-python

Collecting fastapi
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.2.tar.gz (65.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting starlette<0.42.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive

# Attach Google Drive under /content/drive (prompts for authorization in Colab)
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Import necessary modules
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from llama_cpp import Llama
from pyngrok import ngrok
import nest_asyncio
import uvicorn
from threading import Thread

In [4]:
# Allow nested event loops in Colab (the notebook kernel already runs one)
nest_asyncio.apply()

# Initialize FastAPI
app = FastAPI()

# CORS: accept requests from any origin, with any HTTP method and any header.
# Credentials are disabled on purpose: a wildcard origin must not be combined with
# credentialed (cookie / HTTP-auth) requests, and none of the endpoints visible in
# this notebook rely on cookies or authentication.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # any origin may call the API
    allow_credentials=False,  # must stay False while allow_origins is a wildcard
    allow_methods=["*"],  # allow all HTTP methods (GET, POST, PUT, DELETE, ...)
    allow_headers=["*"],  # allow all request headers
)

In [5]:
# Location of the GGUF model file that was copied to Google Drive beforehand
model_path = "/content/drive/MyDrive/llama_model/unsloth.Q4_K_M.gguf"

# Load the model from Google Drive; every other Llama option is left at its default
llm = Llama(model_path=model_path)

llama_model_loader: loaded meta data with 27 key-value pairs and 292 tensors from /content/drive/MyDrive/llama_model/unsloth.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8b Bnb 4bit
llama_model_loader: - kv   3:                       general.organization str              = Unsloth
llama_model_loader: - kv   4:                           general.finetune str              = bnb-4bit
llama_model_loader: - kv   5:                           general.basename str              = meta-llama-3.1
llama_model_loader: - kv   6:                         general.size_label str              = 8B
llama_model_loader: 

In [6]:
# Request body accepted by the POST /chat endpoint.
class ChatRequest(BaseModel):
    # Task description; substituted into the "### Instruction:" slot of the prompt.
    instruction: str
    # Optional extra context; substituted into the "### Input:" slot. Defaults to empty.
    input_data: str = ""

# Response body returned by the POST /chat endpoint.
class ChatResponse(BaseModel):
    # Text generated by the model for the submitted prompt.
    response: str

# Alpaca-style prompt template with an Indonesian preamble.
# It has three positional "{}" slots, filled via str.format in this order:
# instruction, input, response (the response slot is left empty at inference time).
alpaca_prompt = "\n".join([
    "Di bawah ini adalah instruksi yang menjelaskan tugas, dipasangkan dengan masukan yang memberikan konteks lebih lanjut. Tulis tanggapan yang melengkapi permintaan dengan tepat.",
    "",
    "### Instruction:",
    "{}",
    "",
    "### Input:",
    "{}",
    "",
    "### Response:",
    "{}",
])

In [7]:
# Chat endpoint
@app.post("/chat", response_model=ChatResponse)
async def chat_completion(request: ChatRequest):
    """Generate a model response for an instruction and optional input.

    The instruction and input are inserted into the Alpaca-style template and
    sent to the model as a single user message.

    Args:
        request: Body carrying ``instruction`` and the optional ``input_data``.

    Returns:
        ChatResponse wrapping the generated text.

    Raises:
        HTTPException: 422 if the instruction is blank; 500 if inference fails
            or the model returns an unexpected structure.
    """
    # Reject blank instructions up front instead of spending an inference call on them.
    if not request.instruction.strip():
        raise HTTPException(status_code=422, detail="instruction must not be empty")

    # The response slot stays empty: the model is expected to fill it in.
    prompt = alpaca_prompt.format(request.instruction, request.input_data, "")

    # NOTE: create_chat_completion is a synchronous call inside an async handler,
    # so the server handles nothing else while a completion is being generated.
    try:
        result = llm.create_chat_completion(
            messages=[{"role": "user", "content": prompt}]
        )
        response_text = result["choices"][0]["message"]["content"]
    except Exception as exc:
        # Turn any failure into a clean HTTP error; only the exception type is
        # exposed, not its message.
        raise HTTPException(
            status_code=500,
            detail=f"Model inference failed: {type(exc).__name__}",
        ) from exc

    # Guard against a missing content field so response validation does not fail.
    return ChatResponse(response=response_text or "")

@app.get("/")
async def root():
    """Return a static status message; the model itself is not exercised."""
    status = {"message": "LLM Model is successfully running"}
    return status

In [8]:
# Save the ngrok authtoken to ngrok's local configuration file.
# YOUR_NGROK_AUTHTOKEN is a placeholder: substitute your own token before running,
# and avoid sharing or committing the notebook with a real token in this cell.
!ngrok config add-authtoken YOUR_NGROK_AUTHTOKEN

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [9]:
# Port shared by the uvicorn server and the ngrok tunnel
PORT = 8000


def run():
    """Serve the FastAPI app with uvicorn on all interfaces; blocks until the server stops."""
    uvicorn.run(app, host="0.0.0.0", port=PORT)


# Start the FastAPI server in a background thread before exposing it publicly.
# daemon=True so this thread does not keep the interpreter alive at shutdown.
thread = Thread(target=run, daemon=True)
thread.start()

# Open the ngrok tunnel to the local port. `public_url` holds the tunnel object
# (name kept for compatibility); its `.public_url` attribute is the URL string.
public_url = ngrok.connect(PORT)
print("Public URL:", public_url.public_url)

Public URL: NgrokTunnel: "https://c404-34-48-125-98.ngrok-free.app" -> "http://localhost:8000"
