From 602e174241b9ff34ecdb93194ed22e23080ee5c1 Mon Sep 17 00:00:00 2001 From: Michael Yuan Date: Wed, 9 Jul 2025 06:45:34 +0000 Subject: [PATCH 1/2] Refactor the docs Signed-off-by: Michael Yuan --- docs/ai-models/_category_.json | 8 + .../embeddings}/_category_.json | 4 +- docs/ai-models/embeddings/index.md | 66 ++++++++ docs/ai-models/index.md | 33 ++++ .../llamaedge-docker.md | 0 .../llamaedge-kubernetes.md | 0 .../llm/_category_.json | 0 .../llm/api-reference.md | 0 .../llm/quick-start-llm.md} | 37 +++- .../llm/tool-call.md | 0 .../multimodal/_category_.json | 0 .../multimodal/gemma-3.md | 0 .../multimodal/llava.md | 0 .../multimodal/medgemma-4b.md | 0 .../multimodal/medgemma.jpg | Bin .../multimodal/qwen2-5.md | 0 .../speech-to-text/_category_.json | 0 .../speech-to-text/api-reference.md | 0 .../speech-to-text/cli.md | 0 .../speech-to-text/quick-start-whisper.md | 0 .../text-to-image/_category_.json | 0 .../text-to-image/flux.md | 0 .../text-to-image/quick-start-sd.md | 0 .../text-to-speech/_category_.json | 0 .../text-to-speech/gpt-sovits.md | 0 docs/developer-guide/_category_.json | 8 - .../create-embeddings-collection.md | 156 ----------------- docs/developer-guide/multimodal-app.md | 7 - docs/developer-guide/rag-api-server.md | 7 - docs/inference-sdk/_category_.json | 8 + .../basic-llm-app.md | 0 .../chatbot-llm-app.md | 0 .../embedding-app.md | 0 docs/llama-nexus/_category_.json | 7 +- docs/llama-nexus/llama-nexus.md | 1 + docs/llama-nexus/mcp/_category_.json | 5 +- docs/llama-nexus/mcp/quick-start-with-mcp.md | 5 +- .../openai-api/_category_.json | 0 .../openai-api/agent-zero-01.png | Bin .../openai-api/agent-zero-02.png | Bin .../openai-api/agent-zero-03.png | Bin .../openai-api/agent-zero-04.png | Bin .../openai-api/agent-zero-05.png | Bin .../openai-api/agent-zero-06.png | Bin .../openai-api/agent-zero-07.png | Bin .../openai-api/agent-zero-08.png | Bin .../openai-api/agent-zero-09.png | Bin .../openai-api/agent-zero.md | 0 .../openai-api/continue-01.png | Bin .../openai-api/continue.md | 0 .../openai-api/flowise-tool-01.png | Bin .../openai-api/flowise-tool-02.png | Bin .../openai-api/flowise-tool-03.png | Bin .../openai-api/flowise-tool-04.png | Bin .../openai-api/flowise-tool-05.png | Bin .../openai-api/flowise-tool-06.png | Bin .../openai-api/flowiseai-tool-call.md | 0 .../openai-api/img/docsVersionDropdown.png | Bin .../openai-api/img/localeDropdown.png | Bin .../openai-api/intro.md | 0 .../openai-api/langchain.md | 0 .../openai-api/lobechat-llamaedge-01.png | Bin .../openai-api/lobechat-llamaedge-02.png | Bin .../openai-api/lobechat.md | 0 .../openai-api/obsidian-configure.png | Bin .../openai-api/obsidian-enable.png | Bin .../openai-api/obsidian-extract.png | Bin .../openai-api/obsidian-grammar.png | Bin .../openai-api/obsidian-hotkey.png | Bin .../openai-api/obsidian-model.png | Bin .../openai-api/obsidian-summarization.png | Bin .../openai-api/obsidian-text-continuation.png | Bin .../openai-api/obsidian.md | 0 .../openai-api/translation-agent.md | 0 .../openai-api/translation-agent.png | Bin docs/llama-nexus/quick-start.md | 4 +- docs/llama-nexus/register.md | 15 +- docs/user-guide/index.md | 29 ---- docs/user-guide/llm/full-openai.md | 75 --------- .../llm/server-side-rag/_category_.json | 7 - .../llm/server-side-rag/markdown.md | 106 ------------ .../llm/server-side-rag/quick-start.md | 159 ------------------ .../llm/server-side-rag/rag-service.md | 40 ----- docs/user-guide/llm/server-side-rag/text.md | 120 ------------- 84 files changed, 169 insertions(+), 738 
deletions(-) create mode 100644 docs/ai-models/_category_.json rename docs/{user-guide => ai-models/embeddings}/_category_.json (54%) create mode 100644 docs/ai-models/embeddings/index.md create mode 100644 docs/ai-models/index.md rename docs/{user-guide => ai-models}/llamaedge-docker.md (100%) rename docs/{user-guide => ai-models}/llamaedge-kubernetes.md (100%) rename docs/{user-guide => ai-models}/llm/_category_.json (100%) rename docs/{user-guide => ai-models}/llm/api-reference.md (100%) rename docs/{user-guide/llm/get-started-with-llamaedge.md => ai-models/llm/quick-start-llm.md} (65%) rename docs/{user-guide => ai-models}/llm/tool-call.md (100%) rename docs/{user-guide => ai-models}/multimodal/_category_.json (100%) rename docs/{user-guide => ai-models}/multimodal/gemma-3.md (100%) rename docs/{user-guide => ai-models}/multimodal/llava.md (100%) rename docs/{user-guide => ai-models}/multimodal/medgemma-4b.md (100%) rename docs/{user-guide => ai-models}/multimodal/medgemma.jpg (100%) rename docs/{user-guide => ai-models}/multimodal/qwen2-5.md (100%) rename docs/{user-guide => ai-models}/speech-to-text/_category_.json (100%) rename docs/{user-guide => ai-models}/speech-to-text/api-reference.md (100%) rename docs/{user-guide => ai-models}/speech-to-text/cli.md (100%) rename docs/{user-guide => ai-models}/speech-to-text/quick-start-whisper.md (100%) rename docs/{user-guide => ai-models}/text-to-image/_category_.json (100%) rename docs/{user-guide => ai-models}/text-to-image/flux.md (100%) rename docs/{user-guide => ai-models}/text-to-image/quick-start-sd.md (100%) rename docs/{user-guide => ai-models}/text-to-speech/_category_.json (100%) rename docs/{user-guide => ai-models}/text-to-speech/gpt-sovits.md (100%) delete mode 100644 docs/developer-guide/_category_.json delete mode 100644 docs/developer-guide/create-embeddings-collection.md delete mode 100644 docs/developer-guide/multimodal-app.md delete mode 100644 docs/developer-guide/rag-api-server.md create mode 100644 docs/inference-sdk/_category_.json rename docs/{developer-guide => inference-sdk}/basic-llm-app.md (100%) rename docs/{developer-guide => inference-sdk}/chatbot-llm-app.md (100%) rename docs/{developer-guide => inference-sdk}/embedding-app.md (100%) rename docs/{user-guide => llama-nexus}/openai-api/_category_.json (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero-01.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero-02.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero-03.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero-04.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero-05.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero-06.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero-07.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero-08.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero-09.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/agent-zero.md (100%) rename docs/{user-guide => llama-nexus}/openai-api/continue-01.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/continue.md (100%) rename docs/{user-guide => llama-nexus}/openai-api/flowise-tool-01.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/flowise-tool-02.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/flowise-tool-03.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/flowise-tool-04.png (100%) rename 
docs/{user-guide => llama-nexus}/openai-api/flowise-tool-05.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/flowise-tool-06.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/flowiseai-tool-call.md (100%) rename docs/{user-guide => llama-nexus}/openai-api/img/docsVersionDropdown.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/img/localeDropdown.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/intro.md (100%) rename docs/{user-guide => llama-nexus}/openai-api/langchain.md (100%) rename docs/{user-guide => llama-nexus}/openai-api/lobechat-llamaedge-01.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/lobechat-llamaedge-02.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/lobechat.md (100%) rename docs/{user-guide => llama-nexus}/openai-api/obsidian-configure.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/obsidian-enable.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/obsidian-extract.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/obsidian-grammar.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/obsidian-hotkey.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/obsidian-model.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/obsidian-summarization.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/obsidian-text-continuation.png (100%) rename docs/{user-guide => llama-nexus}/openai-api/obsidian.md (100%) rename docs/{user-guide => llama-nexus}/openai-api/translation-agent.md (100%) rename docs/{user-guide => llama-nexus}/openai-api/translation-agent.png (100%) delete mode 100644 docs/user-guide/index.md delete mode 100644 docs/user-guide/llm/full-openai.md delete mode 100644 docs/user-guide/llm/server-side-rag/_category_.json delete mode 100644 docs/user-guide/llm/server-side-rag/markdown.md delete mode 100644 docs/user-guide/llm/server-side-rag/quick-start.md delete mode 100644 docs/user-guide/llm/server-side-rag/rag-service.md delete mode 100644 docs/user-guide/llm/server-side-rag/text.md diff --git a/docs/ai-models/_category_.json b/docs/ai-models/_category_.json new file mode 100644 index 0000000..1e2d81b --- /dev/null +++ b/docs/ai-models/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Serve AI models", + "position": 5, + "link": { + "type": "generated-index", + "description": "Serve open-source AI models via web APIs." + } +} diff --git a/docs/user-guide/_category_.json b/docs/ai-models/embeddings/_category_.json similarity index 54% rename from docs/user-guide/_category_.json rename to docs/ai-models/embeddings/_category_.json index 8c1fd29..fd0d186 100644 --- a/docs/user-guide/_category_.json +++ b/docs/ai-models/embeddings/_category_.json @@ -1,6 +1,6 @@ { - "label": "User Guide", - "position": 5, + "label": "Embeddings", + "position": 1, "link": { "type": "generated-index" } diff --git a/docs/ai-models/embeddings/index.md b/docs/ai-models/embeddings/index.md new file mode 100644 index 0000000..62cd3cb --- /dev/null +++ b/docs/ai-models/embeddings/index.md @@ -0,0 +1,66 @@ +--- +sidebar_position: 1 +--- + +# Working with embedding models + +Embedding models compute vectors from text inputs. The vectors can then be used as a search index +for semantic search in a vector database. + +### Step 1: Install WasmEdge + +First off, you'll need WasmEdge, a high-performance, lightweight, and extensible WebAssembly (Wasm) runtime optimized for server-side and edge computing.
To install WasmEdge along with the necessary plugin for AI inference, open your terminal and execute the following command: + +``` +curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s +``` + +This command fetches and runs the WasmEdge installation script, which automatically installs WasmEdge and the WASI-NN plugin, essential for running AI models such as LLMs and embedding models. + +### Step 2: Download the embedding model + +Next, you'll need to obtain a model file. For this tutorial, we're focusing on the **GTE Qwen2 1.5B** model, which is a top-rated text embedding model built on Qwen2. It generates vectors of 1536 dimensions. The steps are generally applicable to other models too. Use the following command to download the model file. + +``` +curl -LO https://huggingface.co/second-state/gte-Qwen2-1.5B-instruct-GGUF/resolve/main/gte-Qwen2-1.5B-instruct-Q5_K_M.gguf +``` + +### Step 3: Download a portable API server app + +Next, you need an application that serves the model through an OpenAI-compatible API. +The [LlamaEdge API server app](https://github.com/LlamaEdge/LlamaEdge/tree/main/llama-api-server) is a lightweight and cross-platform Wasm app that works on any device +you might have. Just download the compiled binary app. + +``` +curl -LO https://github.com/second-state/LlamaEdge/releases/latest/download/llama-api-server.wasm +``` + +> The LlamaEdge apps are written in Rust and compiled to portable Wasm. That means they can run across devices and OSes without any change to the binary apps. You can simply download and run the compiled wasm apps regardless of your platform. + +### Step 4: Start the API server + +Start the API server with the following command. Notice that the context size of this particular embedding model is +32k and the prompt template is `embedding`. + +``` +wasmedge --dir .:. --nn-preload default:GGML:AUTO:gte-Qwen2-1.5B-instruct-Q5_K_M.gguf llama-api-server.wasm --model-name gte-qwen2-1.5b --ctx-size 32768 --batch-size 8192 --ubatch-size 8192 --prompt-template embedding +``` + +### Step 5: Use the /embeddings API + +You can now send embedding requests to it using the OpenAI-compatible `/embeddings` API endpoint. + +``` +curl http://localhost:8080/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{ + "input": "The food was delicious and the waiter..." + }' +``` + +The response looks like the following. + +``` +{"object":"list","data":[{"index":0,"object":"embedding","embedding":[0.02968290634,0.04592291266,0.05229084566,-0.001912750886,-0.01647545397,0.01744602434,0.008423444815,0.01363539882,-0.005849621724,-0.004947130103,-0.02326701023,0.1068811566,0.01074867789, ... 0.005662892945,-0.01796873659,0.02428019233,-0.0333112292]}],"model":"gte-qwen2-1.5b","usage":{"prompt_tokens":9,"completion_tokens":0,"total_tokens":9}} +``` + diff --git a/docs/ai-models/index.md b/docs/ai-models/index.md new file mode 100644 index 0000000..53bbb0e --- /dev/null +++ b/docs/ai-models/index.md @@ -0,0 +1,33 @@ +--- +sidebar_position: 1 +--- + +# Introduction + +LlamaEdge is a versatile platform supporting multiple types of AI models. The most common use of LlamaEdge is to +stand up API servers that can replace OpenAI as your application's backend.
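+Because the API server speaks the OpenAI wire protocol, any OpenAI client library can talk to it once you point the client at your own server. Below is a minimal sketch using the official Python client. The port, API key, and model name are assumptions carried over from the quick-start guides in this section, so substitute the values your own server was started with.
+
+```python
+from openai import OpenAI
+
+# Point the client at a local LlamaEdge API server instead of api.openai.com.
+client = OpenAI(
+    base_url="http://localhost:8080/v1",  # assumed default port of llama-api-server
+    api_key="NONE",                       # the server does not check the key; any value works
+)
+
+response = client.chat.completions.create(
+    model="Llama-3.2-1B-Instruct-Q5_K_M",  # must match the name the server was started with
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Where is the capital of Texas?"},
+    ],
+)
+print(response.choices[0].message.content)
+```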
+ +## πŸ€– Large Language Models (LLM) +Explore the LLM capabilities +βž” [Get Started with LLM](/docs/category/llm) + +## πŸ‘οΈ Multimodal Vision +Work with vision-language models like LLaVA and Qwen-VL +βž” [Get Started with Multimodal](/docs/category/multimodal) + +## πŸ”’ Embeddings +Work with embedding models for vector and semantic search +βž” [Get Started with Embeddings](/docs/category/embeddings) + +## πŸŽ™οΈ Speech to Text +Run speech-to-text models like Whisper +βž” [Get Started with Speech to Text](/docs/category/speech-to-text) + +## πŸ—£οΈ Text to Speech +Convert text to speech using models like GPT-SoVITS and Piper +βž” [Get Started with Text to Speech](/docs/category/text-to-speech) + +## 🎨 Text to Image +Generate images using models like Stable Diffusion and FLUX +βž” [Get Started with Text-to-Image](/docs/category/text-to-image) + diff --git a/docs/user-guide/llamaedge-docker.md b/docs/ai-models/llamaedge-docker.md similarity index 100% rename from docs/user-guide/llamaedge-docker.md rename to docs/ai-models/llamaedge-docker.md diff --git a/docs/user-guide/llamaedge-kubernetes.md b/docs/ai-models/llamaedge-kubernetes.md similarity index 100% rename from docs/user-guide/llamaedge-kubernetes.md rename to docs/ai-models/llamaedge-kubernetes.md diff --git a/docs/user-guide/llm/_category_.json b/docs/ai-models/llm/_category_.json similarity index 100% rename from docs/user-guide/llm/_category_.json rename to docs/ai-models/llm/_category_.json diff --git a/docs/user-guide/llm/api-reference.md b/docs/ai-models/llm/api-reference.md similarity index 100% rename from docs/user-guide/llm/api-reference.md rename to docs/ai-models/llm/api-reference.md diff --git a/docs/user-guide/llm/get-started-with-llamaedge.md b/docs/ai-models/llm/quick-start-llm.md similarity index 65% rename from docs/user-guide/llm/get-started-with-llamaedge.md rename to docs/ai-models/llm/quick-start-llm.md index 4d99e33..2791881 100644 --- a/docs/user-guide/llm/get-started-with-llamaedge.md +++ b/docs/ai-models/llm/quick-start-llm.md @@ -40,10 +40,37 @@ curl -LO https://github.com/second-state/LlamaEdge/releases/latest/download/llam > The LlamaEdge apps are written in Rust and compiled to portable Wasm. That means they can run across devices and OSes without any change to the binary apps. You can simply download and run the compiled wasm apps regardless of your platform. -### Step 4: Chat with the chatbot UI -The `llama-api-server.wasm` is a web server with an OpenAI-compatible API. You still need HTML files for the chatbot UI. -Download and unzip the HTML UI files as follows. +### Step 4: Use the API + +Start the web server by running the `llama-api-server.wasm` app in WasmEdge. + +``` +wasmedge --dir .:. --nn-preload default:GGML:AUTO:Llama-3.2-1B-Instruct-Q5_K_M.gguf llama-api-server.wasm -p llama-3-chat +``` + +The `llama-api-server.wasm` app is a web server. +You can use the OpenAI-compatible `/chat/completions` API endpoint directly. + +``` +curl -X POST http://localhost:8080/v1/chat/completions \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{"messages":[{"role":"system", "content": "You are a helpful assistant. Try to be as brief as possible."}, {"role":"user", "content": "Where is the capital of Texas?"}]}' +``` + +The response looks like the following.
+ +``` +{"id":"chatcmpl-5f0b5247-7afc-45f8-bc48-614712396a05","object":"chat.completion","created":1751945744,"model":"Mistral-Small-3.1-24B-Instruct-2503-Q5_K_M","choices":[{"index":0,"message":{"content":"The capital of Texas is Austin.","role":"assistant"},"finish_reason":"stop","logprobs":null}],"usage":{"prompt_tokens":38,"completion_tokens":8,"total_tokens":46}} +``` + +### Step 5: Chat with the chatbot UI + +The Chatbot UI is a web app that can interact with the OpenAI-compatible `/chat/completions` API to +provide a human-friendly chatbot in your browser. + +Download and unzip the HTML and JS files for the Chatbot UI as follows. ``` curl -LO https://github.com/LlamaEdge/chatbot-ui/releases/latest/download/chatbot-ui.tar.gz @@ -51,7 +78,7 @@ tar xzf chatbot-ui.tar.gz rm chatbot-ui.tar.gz ``` -Then, start the web server. +Restart the web server to serve those HTML and JS files. ``` wasmedge --dir .:. --nn-preload default:GGML:AUTO:Llama-3.2-1B-Instruct-Q5_K_M.gguf llama-api-server.wasm -p llama-3-chat @@ -59,5 +86,5 @@ wasmedge --dir .:. --nn-preload default:GGML:AUTO:Llama-3.2-1B-Instruct-Q5_K_M.g Go to `http://localhost:8080` on your computer to access the chatbot UI on a web page! -Congratulations! You have now started an LLM app on your own device. But if you are interested in running an agentic app beyond the simple chatbot, you will need to start an API server for this LLM along with the embedding model. Check out [this guide on how to do it](/docs/user-guide/openai-api/intro.md)! +Congratulations! You have now started an LLM app on your own device. diff --git a/docs/user-guide/llm/tool-call.md b/docs/ai-models/llm/tool-call.md similarity index 100% rename from docs/user-guide/llm/tool-call.md rename to docs/ai-models/llm/tool-call.md diff --git a/docs/user-guide/multimodal/_category_.json b/docs/ai-models/multimodal/_category_.json similarity index 100% rename from docs/user-guide/multimodal/_category_.json rename to docs/ai-models/multimodal/_category_.json diff --git a/docs/user-guide/multimodal/gemma-3.md b/docs/ai-models/multimodal/gemma-3.md similarity index 100% rename from docs/user-guide/multimodal/gemma-3.md rename to docs/ai-models/multimodal/gemma-3.md diff --git a/docs/user-guide/multimodal/llava.md b/docs/ai-models/multimodal/llava.md similarity index 100% rename from docs/user-guide/multimodal/llava.md rename to docs/ai-models/multimodal/llava.md diff --git a/docs/user-guide/multimodal/medgemma-4b.md b/docs/ai-models/multimodal/medgemma-4b.md similarity index 100% rename from docs/user-guide/multimodal/medgemma-4b.md rename to docs/ai-models/multimodal/medgemma-4b.md diff --git a/docs/user-guide/multimodal/medgemma.jpg b/docs/ai-models/multimodal/medgemma.jpg similarity index 100% rename from docs/user-guide/multimodal/medgemma.jpg rename to docs/ai-models/multimodal/medgemma.jpg diff --git a/docs/user-guide/multimodal/qwen2-5.md b/docs/ai-models/multimodal/qwen2-5.md similarity index 100% rename from docs/user-guide/multimodal/qwen2-5.md rename to docs/ai-models/multimodal/qwen2-5.md diff --git a/docs/user-guide/speech-to-text/_category_.json b/docs/ai-models/speech-to-text/_category_.json similarity index 100% rename from docs/user-guide/speech-to-text/_category_.json rename to docs/ai-models/speech-to-text/_category_.json diff --git a/docs/user-guide/speech-to-text/api-reference.md b/docs/ai-models/speech-to-text/api-reference.md similarity index 100% rename from docs/user-guide/speech-to-text/api-reference.md rename to 
docs/ai-models/speech-to-text/api-reference.md diff --git a/docs/user-guide/speech-to-text/cli.md b/docs/ai-models/speech-to-text/cli.md similarity index 100% rename from docs/user-guide/speech-to-text/cli.md rename to docs/ai-models/speech-to-text/cli.md diff --git a/docs/user-guide/speech-to-text/quick-start-whisper.md b/docs/ai-models/speech-to-text/quick-start-whisper.md similarity index 100% rename from docs/user-guide/speech-to-text/quick-start-whisper.md rename to docs/ai-models/speech-to-text/quick-start-whisper.md diff --git a/docs/user-guide/text-to-image/_category_.json b/docs/ai-models/text-to-image/_category_.json similarity index 100% rename from docs/user-guide/text-to-image/_category_.json rename to docs/ai-models/text-to-image/_category_.json diff --git a/docs/user-guide/text-to-image/flux.md b/docs/ai-models/text-to-image/flux.md similarity index 100% rename from docs/user-guide/text-to-image/flux.md rename to docs/ai-models/text-to-image/flux.md diff --git a/docs/user-guide/text-to-image/quick-start-sd.md b/docs/ai-models/text-to-image/quick-start-sd.md similarity index 100% rename from docs/user-guide/text-to-image/quick-start-sd.md rename to docs/ai-models/text-to-image/quick-start-sd.md diff --git a/docs/user-guide/text-to-speech/_category_.json b/docs/ai-models/text-to-speech/_category_.json similarity index 100% rename from docs/user-guide/text-to-speech/_category_.json rename to docs/ai-models/text-to-speech/_category_.json diff --git a/docs/user-guide/text-to-speech/gpt-sovits.md b/docs/ai-models/text-to-speech/gpt-sovits.md similarity index 100% rename from docs/user-guide/text-to-speech/gpt-sovits.md rename to docs/ai-models/text-to-speech/gpt-sovits.md diff --git a/docs/developer-guide/_category_.json b/docs/developer-guide/_category_.json deleted file mode 100644 index c7b1d58..0000000 --- a/docs/developer-guide/_category_.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "label": "Developer Guide", - "position": 6, - "link": { - "type": "generated-index", - "description": "Developer your own LLM applications on the top of LlamaEdge." - } -} diff --git a/docs/developer-guide/create-embeddings-collection.md b/docs/developer-guide/create-embeddings-collection.md deleted file mode 100644 index 6dd2277..0000000 --- a/docs/developer-guide/create-embeddings-collection.md +++ /dev/null @@ -1,156 +0,0 @@ ---- -sidebar_position: 5 ---- - -# Create knowledge embeddings using the API server - -The LlamaEdge API server project demonstrates how to support OpenAI style APIs to upload, chunck, and create embeddings for a text document. In this guide, I will show you how to use those API endpoints as a developer. - -> This article is intended to demonstrate capabilities of the open source API server example. You should review the API server source code to learn how those features are implemented. If you are running an RAG application with the API server, check out [this guide](../user-guide/llm/server-side-rag/quick-start). - -## Build the API server - -Check out the source code and build it using Rust `cargo` tools. - -``` -git clone https://github.com/LlamaEdge/LlamaEdge - -cd LlamaEdge/api-server -cargo build --target wasm32-wasip1 --release -``` - -The `llama-api-server.wasm` file is in the `target` directory. - -``` -cp target/wasm32-wasip1/release/llama-api-server.wasm . -``` - -## Download models - -We will need an LLM and a specialized embedding model. While the LLM technically can create embeddings, specialized embedding models can do it much much better. 
- -``` -# The chat model is Llama2 7b chat -curl -LO https://huggingface.co/second-state/Llama-2-7B-Chat-GGUF/resolve/main/Llama-2-7b-chat-hf-Q5_K_M.gguf - -# The embedding model is all-MiniLM-L6-v2 -curl -LO https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/all-MiniLM-L6-v2-ggml-model-f16.gguf -``` - -## Start the API server - -We will now start the API server with both models. The LLM is named `default` and the embedding model is named `embedding`. They each have an external facing model name in the `--model-name` argument. - -``` -wasmedge --dir .:. \ - --nn-preload default:GGML:AUTO:Llama-2-7b-chat-hf-Q5_K_M.gguf \ - --nn-preload embedding:GGML:AUTO:all-MiniLM-L6-v2-ggml-model-f16.gguf \ - llama-api-server.wasm -p llama-2-chat,embedding --web-ui ./chatbot-ui \ - --model-name Llama-2-7b-chat-hf-Q5_K_M,all-MiniLM-L6-v2-ggml-model-f16 \ - --ctx-size 4096,384 \ - --log-prompts --log-stat -``` - -## Create the embeddings - -First, we use the `/files` API to upload a file `paris.txt` to the API server. - -``` -curl -X POST http://127.0.0.1:8080/v1/files -F "file=@paris.txt" -``` - -If the command is successful, you should see the similar output as below in your terminal. - -``` -{ - "id": "file_4bc24593-2a57-4646-af16-028855e7802e", - "bytes": 2161, - "created_at": 1711611801, - "filename": "paris.txt", - "object": "file", - "purpose": "assistants" -} -``` - -Next, take the `id` and request the `/chunks` API to chunk the file `paris.txt` into smaller pieces. The reason is that each embedding vector can only hold limited amount of information. The embedding model can "understand" the file content, and determine the optimistic places to break up the text into chunks. - -``` -curl -X POST http://localhost:8080/v1/chunks \ - -H 'accept:application/json' \ - -H 'Content-Type: application/json' \ - -d '{"id":"file_4bc24593-2a57-4646-af16-028855e7802e", "filename":"paris.txt"}' -``` - -The following is an example return with the generated chunks. - -``` -{ - "id": "file_4bc24593-2a57-4646-af16-028855e7802e", - "filename": "paris.txt", - "chunks": [ - "Paris, city and capital of France, ..., for Paris has retained its importance as a centre for education and intellectual pursuits.", - "Paris's site at a crossroads ..., drawing to itself much of the talent and vitality of the provinces." - ] -} -``` - -Finally, use the `/embeddings` API to generate the embedding vectors. Make sure that you pass in the embedding model name. - -```bash -curl -X POST http://localhost:8080/v1/embeddings \ - -H 'accept:application/json' \ - -H 'Content-Type: application/json' \ - -d '{"model": "all-MiniLM-L6-v2-ggml-model-f16", "input":["Paris, city and capital of France, ..., for Paris has retained its importance as a centre for education and intellectual pursuits.", "Paris's site at a crossroads ..., drawing to itself much of the talent and vitality of the provinces."]}' -``` - -The embeddings returned are like below. - -```json -{ - "object": "list", - "data": [ - { - "index": 0, - "object": "embedding", - "embedding": [ - 0.1428378969, - -0.0447309874, - 0.007660218049, - ... - -0.0128974719, - -0.03543198109, - 0.03974733502, - 0.00946635101, - -0.01531364303 - ] - }, - { - "index": 1, - "object": "embedding", - "embedding": [ - 0.0697753951, - -0.0001159032545, - 0.02073983476, - ...
- 0.03565846011, - -0.04550019652, - 0.02691745944, - 0.02498772368, - -0.003226313973 - ] - } - ], - "model": "all-MiniLM-L6-v2-ggml-model-f16", - "usage": { - "prompt_tokens": 491, - "completion_tokens": 0, - "total_tokens": 491 - } -} -``` - -## Next step - -Once you have the embeddings in a JSON file, you can store them into a vector database. It will probably require you to write a script to combine each vector point with its corresponding source text, and then upsert into the database's vector collection. This step will be specific to the vector database and RAG strategy you choose. - - diff --git a/docs/developer-guide/multimodal-app.md b/docs/developer-guide/multimodal-app.md deleted file mode 100644 index 0738647..0000000 --- a/docs/developer-guide/multimodal-app.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -sidebar_position: 3 ---- - -# Create a multimodal app - -Coming soon. \ No newline at end of file diff --git a/docs/developer-guide/rag-api-server.md b/docs/developer-guide/rag-api-server.md deleted file mode 100644 index e9aae59..0000000 --- a/docs/developer-guide/rag-api-server.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -sidebar_position: 5 ---- - -# Implement your own RAG API server - -Coming soon. \ No newline at end of file diff --git a/docs/inference-sdk/_category_.json b/docs/inference-sdk/_category_.json new file mode 100644 index 0000000..171cef0 --- /dev/null +++ b/docs/inference-sdk/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Embed AI models", + "position": 7, + "link": { + "type": "generated-index", + "description": "Embed AI models, such as LLMs, into your own apps using the LlamaEdge inference SDK." + } +} diff --git a/docs/developer-guide/basic-llm-app.md b/docs/inference-sdk/basic-llm-app.md similarity index 100% rename from docs/developer-guide/basic-llm-app.md rename to docs/inference-sdk/basic-llm-app.md diff --git a/docs/developer-guide/chatbot-llm-app.md b/docs/inference-sdk/chatbot-llm-app.md similarity index 100% rename from docs/developer-guide/chatbot-llm-app.md rename to docs/inference-sdk/chatbot-llm-app.md diff --git a/docs/developer-guide/embedding-app.md b/docs/inference-sdk/embedding-app.md similarity index 100% rename from docs/developer-guide/embedding-app.md rename to docs/inference-sdk/embedding-app.md diff --git a/docs/llama-nexus/_category_.json b/docs/llama-nexus/_category_.json index c9d70ca..a82349c 100644 --- a/docs/llama-nexus/_category_.json +++ b/docs/llama-nexus/_category_.json @@ -1,7 +1,8 @@ { - "label": "Llama Nexus", - "position": 7, + "label": "Agents and apps", + "position": 6, "link": { - "type": "generated-index" + "type": "generated-index", + "description": "Combine AI models, tools, and MCP servers into a single API server to enable agents and apps." } } diff --git a/docs/llama-nexus/llama-nexus.md b/docs/llama-nexus/llama-nexus.md index 00b7dde..2a00494 100644 --- a/docs/llama-nexus/llama-nexus.md +++ b/docs/llama-nexus/llama-nexus.md @@ -7,6 +7,7 @@ sidebar_position: 1 Llama-Nexus is a gateway service designed to manage and orchestrate OpenAI-compatible API servers. Key Features + * Unified Interface: Provides a single interface to access various AI services, including chat completions, audio processing, image generation, and text-to-speech. These services can be powered by models running locally or provided by third-party SaaS platforms. * Built-in MCP Client: Acts as an MCP client capable of connecting to an MCP server and communicating with LLM hosts.
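+To make the gateway concrete, here is a hedged sketch of the flow covered in the following chapters: register an OpenAI-compatible downstream server with the admin endpoint, then send an ordinary chat request through the gateway. The ports and the payload field names below are assumptions taken from the quick-start and register chapters, so verify them against your own Llama-Nexus configuration.
+
+```python
+import requests
+
+# Register a local OpenAI-compatible chat server with the gateway.
+# The admin port and JSON fields mirror the register chapter's curl example
+# and may differ in your version of Llama-Nexus.
+requests.post(
+    "http://localhost:3389/admin/servers/register",
+    json={"url": "http://localhost:8080", "kind": "chat"},
+).raise_for_status()
+
+# Query the gateway exactly as you would query OpenAI.
+resp = requests.post(
+    "http://localhost:9095/v1/chat/completions",  # gateway port used in the quick-start examples
+    json={"messages": [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the weather in Singapore?"},
+    ]},
+)
+print(resp.json()["choices"][0]["message"]["content"])
+```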
diff --git a/docs/llama-nexus/mcp/_category_.json b/docs/llama-nexus/mcp/_category_.json index d5b4513..ec75783 100644 --- a/docs/llama-nexus/mcp/_category_.json +++ b/docs/llama-nexus/mcp/_category_.json @@ -1,7 +1,8 @@ { - "label": "MCP", + "label": "Working with MCP servers", "position": 3, "link": { - "type": "generated-index" + "type": "generated-index", + "description": "Adding MCP servers into Llama Nexus." } } diff --git a/docs/llama-nexus/mcp/quick-start-with-mcp.md b/docs/llama-nexus/mcp/quick-start-with-mcp.md index 49d1bc9..55d4f92 100644 --- a/docs/llama-nexus/mcp/quick-start-with-mcp.md +++ b/docs/llama-nexus/mcp/quick-start-with-mcp.md @@ -2,7 +2,7 @@ sidebar_position: 4 --- -# Qucik start with the MCP support +# Quick start with MCP servers One of the key features of Llama-Nexus is its built-in MCP Client, which allows you to use Llama-Nexus for MCP-related tasks just like Claude Desktop and Cursor. @@ -115,6 +115,7 @@ curl -X POST http://localhost:9095/v1/chat/completions \ ``` Expected output: + ```json { "id": "chatcmpl-cf63660e-3494-472c-b4d0-6cda72e1f8e9", @@ -138,4 +139,4 @@ Expected output: "total_tokens": 94 } } -``` \ No newline at end of file +``` diff --git a/docs/user-guide/openai-api/_category_.json b/docs/llama-nexus/openai-api/_category_.json similarity index 100% rename from docs/user-guide/openai-api/_category_.json rename to docs/llama-nexus/openai-api/_category_.json diff --git a/docs/user-guide/openai-api/agent-zero-01.png b/docs/llama-nexus/openai-api/agent-zero-01.png similarity index 100% rename from docs/user-guide/openai-api/agent-zero-01.png rename to docs/llama-nexus/openai-api/agent-zero-01.png diff --git a/docs/user-guide/openai-api/agent-zero-02.png b/docs/llama-nexus/openai-api/agent-zero-02.png similarity index 100% rename from docs/user-guide/openai-api/agent-zero-02.png rename to docs/llama-nexus/openai-api/agent-zero-02.png diff --git a/docs/user-guide/openai-api/agent-zero-03.png b/docs/llama-nexus/openai-api/agent-zero-03.png similarity index 100% rename from docs/user-guide/openai-api/agent-zero-03.png rename to docs/llama-nexus/openai-api/agent-zero-03.png diff --git a/docs/user-guide/openai-api/agent-zero-04.png b/docs/llama-nexus/openai-api/agent-zero-04.png similarity index 100% rename from docs/user-guide/openai-api/agent-zero-04.png rename to docs/llama-nexus/openai-api/agent-zero-04.png diff --git a/docs/user-guide/openai-api/agent-zero-05.png b/docs/llama-nexus/openai-api/agent-zero-05.png similarity index 100% rename from docs/user-guide/openai-api/agent-zero-05.png rename to docs/llama-nexus/openai-api/agent-zero-05.png diff --git a/docs/user-guide/openai-api/agent-zero-06.png b/docs/llama-nexus/openai-api/agent-zero-06.png similarity index 100% rename from docs/user-guide/openai-api/agent-zero-06.png rename to docs/llama-nexus/openai-api/agent-zero-06.png diff --git a/docs/user-guide/openai-api/agent-zero-07.png b/docs/llama-nexus/openai-api/agent-zero-07.png similarity index 100% rename from docs/user-guide/openai-api/agent-zero-07.png rename to docs/llama-nexus/openai-api/agent-zero-07.png diff --git a/docs/user-guide/openai-api/agent-zero-08.png b/docs/llama-nexus/openai-api/agent-zero-08.png similarity index 100% rename from docs/user-guide/openai-api/agent-zero-08.png rename to docs/llama-nexus/openai-api/agent-zero-08.png diff --git a/docs/user-guide/openai-api/agent-zero-09.png b/docs/llama-nexus/openai-api/agent-zero-09.png similarity index 100% rename from docs/user-guide/openai-api/agent-zero-09.png rename to 
docs/llama-nexus/openai-api/agent-zero-09.png diff --git a/docs/user-guide/openai-api/agent-zero.md b/docs/llama-nexus/openai-api/agent-zero.md similarity index 100% rename from docs/user-guide/openai-api/agent-zero.md rename to docs/llama-nexus/openai-api/agent-zero.md diff --git a/docs/user-guide/openai-api/continue-01.png b/docs/llama-nexus/openai-api/continue-01.png similarity index 100% rename from docs/user-guide/openai-api/continue-01.png rename to docs/llama-nexus/openai-api/continue-01.png diff --git a/docs/user-guide/openai-api/continue.md b/docs/llama-nexus/openai-api/continue.md similarity index 100% rename from docs/user-guide/openai-api/continue.md rename to docs/llama-nexus/openai-api/continue.md diff --git a/docs/user-guide/openai-api/flowise-tool-01.png b/docs/llama-nexus/openai-api/flowise-tool-01.png similarity index 100% rename from docs/user-guide/openai-api/flowise-tool-01.png rename to docs/llama-nexus/openai-api/flowise-tool-01.png diff --git a/docs/user-guide/openai-api/flowise-tool-02.png b/docs/llama-nexus/openai-api/flowise-tool-02.png similarity index 100% rename from docs/user-guide/openai-api/flowise-tool-02.png rename to docs/llama-nexus/openai-api/flowise-tool-02.png diff --git a/docs/user-guide/openai-api/flowise-tool-03.png b/docs/llama-nexus/openai-api/flowise-tool-03.png similarity index 100% rename from docs/user-guide/openai-api/flowise-tool-03.png rename to docs/llama-nexus/openai-api/flowise-tool-03.png diff --git a/docs/user-guide/openai-api/flowise-tool-04.png b/docs/llama-nexus/openai-api/flowise-tool-04.png similarity index 100% rename from docs/user-guide/openai-api/flowise-tool-04.png rename to docs/llama-nexus/openai-api/flowise-tool-04.png diff --git a/docs/user-guide/openai-api/flowise-tool-05.png b/docs/llama-nexus/openai-api/flowise-tool-05.png similarity index 100% rename from docs/user-guide/openai-api/flowise-tool-05.png rename to docs/llama-nexus/openai-api/flowise-tool-05.png diff --git a/docs/user-guide/openai-api/flowise-tool-06.png b/docs/llama-nexus/openai-api/flowise-tool-06.png similarity index 100% rename from docs/user-guide/openai-api/flowise-tool-06.png rename to docs/llama-nexus/openai-api/flowise-tool-06.png diff --git a/docs/user-guide/openai-api/flowiseai-tool-call.md b/docs/llama-nexus/openai-api/flowiseai-tool-call.md similarity index 100% rename from docs/user-guide/openai-api/flowiseai-tool-call.md rename to docs/llama-nexus/openai-api/flowiseai-tool-call.md diff --git a/docs/user-guide/openai-api/img/docsVersionDropdown.png b/docs/llama-nexus/openai-api/img/docsVersionDropdown.png similarity index 100% rename from docs/user-guide/openai-api/img/docsVersionDropdown.png rename to docs/llama-nexus/openai-api/img/docsVersionDropdown.png diff --git a/docs/user-guide/openai-api/img/localeDropdown.png b/docs/llama-nexus/openai-api/img/localeDropdown.png similarity index 100% rename from docs/user-guide/openai-api/img/localeDropdown.png rename to docs/llama-nexus/openai-api/img/localeDropdown.png diff --git a/docs/user-guide/openai-api/intro.md b/docs/llama-nexus/openai-api/intro.md similarity index 100% rename from docs/user-guide/openai-api/intro.md rename to docs/llama-nexus/openai-api/intro.md diff --git a/docs/user-guide/openai-api/langchain.md b/docs/llama-nexus/openai-api/langchain.md similarity index 100% rename from docs/user-guide/openai-api/langchain.md rename to docs/llama-nexus/openai-api/langchain.md diff --git a/docs/user-guide/openai-api/lobechat-llamaedge-01.png 
b/docs/llama-nexus/openai-api/lobechat-llamaedge-01.png similarity index 100% rename from docs/user-guide/openai-api/lobechat-llamaedge-01.png rename to docs/llama-nexus/openai-api/lobechat-llamaedge-01.png diff --git a/docs/user-guide/openai-api/lobechat-llamaedge-02.png b/docs/llama-nexus/openai-api/lobechat-llamaedge-02.png similarity index 100% rename from docs/user-guide/openai-api/lobechat-llamaedge-02.png rename to docs/llama-nexus/openai-api/lobechat-llamaedge-02.png diff --git a/docs/user-guide/openai-api/lobechat.md b/docs/llama-nexus/openai-api/lobechat.md similarity index 100% rename from docs/user-guide/openai-api/lobechat.md rename to docs/llama-nexus/openai-api/lobechat.md diff --git a/docs/user-guide/openai-api/obsidian-configure.png b/docs/llama-nexus/openai-api/obsidian-configure.png similarity index 100% rename from docs/user-guide/openai-api/obsidian-configure.png rename to docs/llama-nexus/openai-api/obsidian-configure.png diff --git a/docs/user-guide/openai-api/obsidian-enable.png b/docs/llama-nexus/openai-api/obsidian-enable.png similarity index 100% rename from docs/user-guide/openai-api/obsidian-enable.png rename to docs/llama-nexus/openai-api/obsidian-enable.png diff --git a/docs/user-guide/openai-api/obsidian-extract.png b/docs/llama-nexus/openai-api/obsidian-extract.png similarity index 100% rename from docs/user-guide/openai-api/obsidian-extract.png rename to docs/llama-nexus/openai-api/obsidian-extract.png diff --git a/docs/user-guide/openai-api/obsidian-grammar.png b/docs/llama-nexus/openai-api/obsidian-grammar.png similarity index 100% rename from docs/user-guide/openai-api/obsidian-grammar.png rename to docs/llama-nexus/openai-api/obsidian-grammar.png diff --git a/docs/user-guide/openai-api/obsidian-hotkey.png b/docs/llama-nexus/openai-api/obsidian-hotkey.png similarity index 100% rename from docs/user-guide/openai-api/obsidian-hotkey.png rename to docs/llama-nexus/openai-api/obsidian-hotkey.png diff --git a/docs/user-guide/openai-api/obsidian-model.png b/docs/llama-nexus/openai-api/obsidian-model.png similarity index 100% rename from docs/user-guide/openai-api/obsidian-model.png rename to docs/llama-nexus/openai-api/obsidian-model.png diff --git a/docs/user-guide/openai-api/obsidian-summarization.png b/docs/llama-nexus/openai-api/obsidian-summarization.png similarity index 100% rename from docs/user-guide/openai-api/obsidian-summarization.png rename to docs/llama-nexus/openai-api/obsidian-summarization.png diff --git a/docs/user-guide/openai-api/obsidian-text-continuation.png b/docs/llama-nexus/openai-api/obsidian-text-continuation.png similarity index 100% rename from docs/user-guide/openai-api/obsidian-text-continuation.png rename to docs/llama-nexus/openai-api/obsidian-text-continuation.png diff --git a/docs/user-guide/openai-api/obsidian.md b/docs/llama-nexus/openai-api/obsidian.md similarity index 100% rename from docs/user-guide/openai-api/obsidian.md rename to docs/llama-nexus/openai-api/obsidian.md diff --git a/docs/user-guide/openai-api/translation-agent.md b/docs/llama-nexus/openai-api/translation-agent.md similarity index 100% rename from docs/user-guide/openai-api/translation-agent.md rename to docs/llama-nexus/openai-api/translation-agent.md diff --git a/docs/user-guide/openai-api/translation-agent.png b/docs/llama-nexus/openai-api/translation-agent.png similarity index 100% rename from docs/user-guide/openai-api/translation-agent.png rename to docs/llama-nexus/openai-api/translation-agent.png diff --git a/docs/llama-nexus/quick-start.md 
b/docs/llama-nexus/quick-start.md index 3718024..94c20a4 100644 --- a/docs/llama-nexus/quick-start.md +++ b/docs/llama-nexus/quick-start.md @@ -30,7 +30,7 @@ By default, Llama-Nexus listens on port `3389`. Assuming you already have an OpenAI-compatible API server for your LLM, let's register it with Llama-Nexus. -If you'd like to run a model locally, refer to the [Quick Start with LlamaEdge](../user-guide/llm/get-started-with-llamaedge.md) guide. +If you'd like to run a model locally, refer to the [Quick Start with LLM](../ai-models/llm/quick-start-llm.md) guide. Register the LLM chat API server for the `/chat/completions` endpoint: @@ -57,4 +57,4 @@ curl -X POST http://localhost:9095/v1/chat/completions \ -H 'accept: application/json' \ -H 'Content-Type: application/json' \ -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."},{"role":"user", "content": "What is the weather in Singapore?"}]}' -``` \ No newline at end of file +``` diff --git a/docs/llama-nexus/register.md b/docs/llama-nexus/register.md index 9c2de31..5e22750 100644 --- a/docs/llama-nexus/register.md +++ b/docs/llama-nexus/register.md @@ -2,13 +2,16 @@ sidebar_position: 3 --- -# Register and Unregister +# Register API services -Llama Nexus supports all common LLM services. Your API service must follow OpenAI specifications. +You can add almost any OpenAI-compatible API service to the Llama-Nexus gateway. +In this chapter, we demonstrate how various LlamaEdge API servers can be registered +under a single Llama Nexus gateway. The gateway then provides all the OpenAI API +endpoints supported by the registered API servers. ## Prerequisites -- Llama Nexus server running (default port: 9095) +- Llama Nexus server running (default port: 3389) - Target services running and accessible - Services implementing OpenAI-compatible APIs @@ -79,12 +82,8 @@ curl --location 'http://localhost:3389/admin/servers/register' \ }' ``` -## Unregister - -To be add - ## Notes - All services must implement OpenAI-compatible APIs - URLs can be local (`http://localhost:port`) or remote (`https://domain.com`) -- Services are automatically load-balanced if multiple instances of the same kind are registered \ No newline at end of file +- Services are automatically load-balanced if multiple instances of the same kind are registered diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md deleted file mode 100644 index 693ad9f..0000000 --- a/docs/user-guide/index.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -sidebar_position: 1 ---- - -# LlamaEdge - -LlamaEdge is a versatile platform supporting multiple types of AI models and applications: - -## Model Categories - -### πŸ€– Large Language Models (LLM) -Explore the LLM capabilities and RAG applications -βž” [Get Started with LLM](/docs/category/llm) - -### πŸŽ™οΈ Speech to Text -Run speech recognition models like Whisper -βž” [Get Started with Speech to Text](/docs/category/speech-to-text) - -### πŸ—£οΈ Text to Speech -Convert text to speech using models like GPT-SOVITs and Piper -βž” [Get Started with Text to Speech](/docs/category/text-to-speech) - -### 🎨 Text to Image -Generate images using models like Stable Diffusion and FLUX -βž” [Get Started with Text-to-Image](/docs/category/text-to-image) - -### πŸ‘οΈ Multimodal Vision -Work with vision-language models like Llava and Qwen-VL -βž” [Get Started with Multimodal](/docs/category/multimodal) \ No newline at end of file diff --git a/docs/user-guide/llm/full-openai.md deleted
file mode 100644 index b7f774a..0000000 --- a/docs/user-guide/llm/full-openai.md +++ /dev/null @@ -1,75 +0,0 @@ ---- -sidebar_position: 3 ---- - -# Start an OpenAI compatible API server - -LlamaEdge support running LLMs along with embbedding models, allowing you to start a drop-in replacement for OpenAI API. - -### Step 1: Install WasmEdge - -First off, you'll need WasmEdge. To install WasmEdge along with the necessary plugin for AI inference, open your terminal and execute the following command: - -``` -curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -``` - -This command fetches and runs the WasmEdge installation script, which automatically installs WasmEdge and the WASI-NN plugin, essential for running LLM models like Llama 3.1 and Nomix-embed models. - -### Step 2: Download the LLM Model and Embedding Model - -Next, you'll need to obtain a model file. For this tutorial, we're focusing on the **Llama 3.2 1B model finetuned for instruction following and Nomic embed model**, but the steps are generally applicable to other models too. Use the following command to download the model files. - -``` -# The chat model is Llama 3.2 1b chat -curl -LO https://huggingface.co/second-state/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q5_K_M.gguf - -# The embedding model is nomic-embed-text-v1.5 -curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf -``` - -This command downloads the Llama-3.2-1B-Instruct model and nomic-embed-text-v1.5 model from Huggingface, an AI model hosting platform. - -### Step 3: Download a Portable OpenAI Compatible Server - -To start an OpenAI-compatible API server, you need the [LlamaEdge API server](https://github.com/LlamaEdge/LlamaEdge/tree/main/api-server) app. - -``` -curl -LO https://github.com/second-state/LlamaEdge/releases/latest/download/llama-api-server.wasm -``` - -The `llama-api-server.wasm` is a web server with an OpenAI-compatible API. - -> The LlamaEdge apps are written in Rust and compiled to portable Wasm. That means they can run across devices and OSes without any change to the binary apps. You can simply download and run the compiled wasm apps regardless of your platform. - -### Step 4: Start the API Server - -With everything set up, it's time to run the models as follows. - -``` -wasmedge --dir .:. \ - --nn-preload default:GGML:AUTO:Llama-3.2-1B-Instruct-Q5_K_M.gguf \ - --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-f16.gguf \ - llama-api-server.wasm -p llama-3-chat,embedding \ - --model-name Llama-3.2-1B-Instruct-Q5_K_M,nomic-embed-text-v1.5-f16 \ - --ctx-size 8192,8192 \ - --batch-size 128,8192 \ - --log-prompts --log-stat -``` - -This command executes the chat application, allowing you to start interacting with the Llama 3 8B model. Here, `wasmedge` is the command to run the WasmEdge runtime, `--nn-preload` specifies the model to use with the WASI-NN plugin, and `-p` sets the prompt template for the chat. - -### Step 5: Send an API Request - -Now you have a drop-in replacement for OpenAI API. You can integrate it with any agents/frameworks based on OpenAI. - -|Config option | Value | -|-----|--------| -| API endpoint URL | http://localhost:8080/v1 | -| Model Name (for LLM) | Llama-3.2-1B-Instruct-Q5_K_M | -| Model Name (for Text embedding) | nomic-embed-text-v1.5-f16 | -| API key | Empty or any value | - - -Congratulations! 
Next, you can integrate your APT server with [OpenAI ecosystem apps](/docs/category/ecosystem-apps). - diff --git a/docs/user-guide/llm/server-side-rag/_category_.json b/docs/user-guide/llm/server-side-rag/_category_.json deleted file mode 100644 index 5a9e6d8..0000000 --- a/docs/user-guide/llm/server-side-rag/_category_.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "label": "Long-term memory and knowledge", - "position": 20, - "link": { - "type": "generated-index" - } -} diff --git a/docs/user-guide/llm/server-side-rag/markdown.md b/docs/user-guide/llm/server-side-rag/markdown.md deleted file mode 100644 index d93886c..0000000 --- a/docs/user-guide/llm/server-side-rag/markdown.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -sidebar_position: 3 ---- - -# Knowledge base from a markdown file - -In this section, we will discuss how to create a vector collection snapshot from a markdown file. The -snapshot file can then be loaded by a GaiaNet node as its knowledge base. -You will have the option to create a vector for each markdown section. - -## Prerequisites - -Install the WasmEdge Runtime, the cross-platform LLM runtime. - -``` -curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -``` - -Download an embedding model. - -``` -curl -LO https://huggingface.co/gaianet/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf -``` - -The embedding model is a special kind of LLM that turns sentences into vectors. The vectors can then be stored in a vector database and searched later. When the sentences are from a body of text that represents a knowledge domain, that vector database becomes our RAG knowledge base. - -## Start a vector database - -By default, we use Qdrant as the vector database. You can start a Qdrant instance on your server using Docker. The following command starts it in the background. - -``` -mkdir qdrant_storage -mkdir qdrant_snapshots - -nohup docker run -d -p 6333:6333 -p 6334:6334 \ - -v $(pwd)/qdrant_storage:/qdrant/storage:z \ - -v $(pwd)/qdrant_snapshots:/qdrant/snapshots:z \ - qdrant/qdrant -``` - -## Create the vector collection snapshot - -Delete the default collection if it exists. - -``` -curl -X DELETE 'http://localhost:6333/collections/default' -``` - -Create a new collection called default. Notice that it is 768 dimensions. That is the output vector size of the embedding model `nomic-embed-text-v1.5`. If you are using a different embedding model, you should use a dimension that fits the model. - -``` -curl -X PUT 'http://localhost:6333/collections/default' \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "size": 768, - "distance": "Cosine", - "on_disk": true - } - }' -``` - -Download a program to chunk a document and create embeddings. - -``` -curl -LO https://github.com/GaiaNet-AI/embedding-tools/raw/main/markdown_embed/markdown_embed.wasm -``` - -It chunks the document based on markdown sections. You can check out the [Rust source code](https://github.com/GaiaNet-AI/embedding-tools/tree/main/markdown_embed) here and modify it if you need to use a different chunking strategy. - -Next, you can run the program by passing a collection name, vector dimension, and the source document. You can pass in the desired markdown heading level for chunking using the `--heading_level` option. The `--ctx_size` option matches the embedding model's context window size, which in this case is 8192 tokens allowing it to process long sections of text. 
Make sure that Qdrant is running on your local machine. The model is preloaded under the name embedding. The wasm app then uses the embedding model to create the 768-dimension vectors from `paris.md` and saves them into the default collection. - -``` -curl -LO https://huggingface.co/datasets/gaianet/paris/raw/main/paris.md - -wasmedge --dir .:. \ - --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5.f16.gguf \ - markdown_embed.wasm embedding default 768 paris.md --heading_level 1 --ctx_size 8192 -``` - -## More options - -You can pass the following options to the program. - -* Using `-c` or `--ctx_size` to specify the context size of the input. This defaults to 512. -* Using `-l` or `--heading_level` to specify the markdown heading level for each vector. This defaults to 1. -* Using `-m` or `--maximum_context_length` to specify a context length in the CLI argument. That is to truncate and warn for each text segment that goes above the context length. -* Using `-s` or `--start_vector_id` to specify the start vector ID in the CLI argument. This will allow us to run this app multiple times on multiple documents on the same vector collection. - -Example: the above example but to append the London guide to the end of an existing collection starting from index 42. - -``` -wasmedge --dir .:. \ - --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5.f16.gguf \ - markdown_embed.wasm embedding default 768 london.md -c 8192 -l 1 -s 42 -``` - -## Create a vector snapshot - -You can create a snapshot of the collection, which can be shared and loaded into a different Qdrant database. You can find the snapshot file in the `qdrant_snapshots` directory. - -``` -curl -X POST 'http://localhost:6333/collections/default/snapshots' -``` - -Have fun! diff --git a/docs/user-guide/llm/server-side-rag/quick-start.md b/docs/user-guide/llm/server-side-rag/quick-start.md deleted file mode 100644 index 47c69fb..0000000 --- a/docs/user-guide/llm/server-side-rag/quick-start.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -sidebar_position: 1 ---- - -# Long-term memory for the LLM - -The LLM app requires both long-term and short-term memories. Long-term memory includes factual knowledge, historical facts, background stories etc. They are best added to the context as complete chapters instead of small chunks of text to maintain the internal consistency of the knowledge. - -[RAG](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) -is an important technique to inject contextual knowledge into an LLM application. It improves accuracy and reduces the hallucination of LLMs. -An effective RAG application combines real-time and user-specific short-term memory (chunks) with stable long-term memory (chapters) in the prompt context. - -Since the application's long-term memory is stable (even immutable), we package it in a vector database tightly coupled with the LLM. The client app assembles the short-term memory in the prompt and is supplemented with the long-term memory on the LLM server. We call the approach "server-side RAG". - -> The long context length supported by modern LLMs are especially well suited for long term knowledge that are best represented by chapters of text. - -The LlamaEdge API server provides application components that developers can reuse to -supplement the LLM with long-term memories. -We have built this feature into the [rag-api-server](https://github.com/LlamaEdge/rag-api-server) project. 
-The result is an OpenAI -compatible LLM service that is grounded by long-term knowledge on the server side. The client application -can simply chat with it or provide realtime / short-term memory since the LLM is already aware of the -domain or background. - -## Prerequisites - -Install the [WasmEdge Runtime](https://github.com/WasmEdge/WasmEdge), our cross-platform LLM runtime. - -``` -curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -``` - -Download the pre-built binary for the LlamaEdge API server with RAG support. - -``` -curl -LO https://github.com/LlamaEdge/rag-api-server/releases/latest/download/rag-api-server.wasm -``` - -And the chatbot web UI for the API server. - -``` -curl -LO https://github.com/second-state/chatbot-ui/releases/latest/download/chatbot-ui.tar.gz -tar xzf chatbot-ui.tar.gz -rm chatbot-ui.tar.gz -``` - -Download a chat model and an embedding model. - -``` -# The chat model is Llama3 8b chat -curl -LO https://huggingface.co/second-state/Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q5_K_M.gguf - -# The embedding model is nomic-embed-text-v1.5 -curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf -``` - -The embedding model is a special kind of LLM that turns sentences into vectors. The vectors can then be stored in a vector database and searched later. When the sentences are from a body of text that represents a knowledge domain, that vector database becomes our RAG knowledge base. - -## Prepare a vector database - -By default, we use Qdrant as the vector database. You can start a Qdrant instance on your server using Docker. The following command starts it in the background. - -``` -mkdir qdrant_storage -mkdir qdrant_snapshots - -nohup docker run -d -p 6333:6333 -p 6334:6334 \ - -v $(pwd)/qdrant_storage:/qdrant/storage:z \ - -v $(pwd)/qdrant_snapshots:/qdrant/snapshots:z \ - qdrant/qdrant -``` - -Delete the `default` collection if it exists. - -``` -curl -X DELETE 'http://localhost:6333/collections/default' -``` - -Next, download a knowledge base, which is in the form of a vector snapshot. For example, here is an vector snapshot -created from a guidebook for Paris. It is a 768-dimension vector collection created by the embedding model [nomic-embed-text](https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF), which you have already downloaded. - -``` -curl -LO https://huggingface.co/datasets/gaianet/paris/resolve/main/paris_768_nomic-embed-text-v1.5-f16.snapshot -``` - -> You can create your own vector snapshots using tools discussed in the next several chapters. - -Import the vector snapshot file into the local Qdrant database server's `default` collection. - -``` -curl -s -X POST http://localhost:6333/collections/default/snapshots/upload?priority=snapshot \ - -H 'Content-Type:multipart/form-data' \ - -F 'snapshot=@paris_768_nomic-embed-text-v1.5-f16.snapshot' -``` - -## Start the API server - -Let's start the LlamaEdge RAG API server on port 8080. By default, it connects to the local Qdrant server. - -``` -wasmedge --dir .:. 
- -The CLI arguments are self-explanatory. -Notice that those arguments are different from the [llama-api-server.wasm](https://github.com/LlamaEdge/LlamaEdge/tree/main/api-server) app. - -* The `--nn-preload` loads the two models we just downloaded. The chat model is named `default` and the embedding model is named `embedding`. -* The `rag-api-server.wasm` is the API server app. It is written in Rust using the LlamaEdge SDK, and is already compiled to a cross-platform Wasm binary. -* The `--model-name` specifies the names of those two models so that API calls can be routed to specific models. -* The `--ctx-size` specifies the max input size for each of those two models listed in `--model-name`. -* The `--batch-size` specifies the batch processing size for each of those two models listed in `--model-name`. This parameter has a large impact on the RAM use of the API server. -* The `--rag-prompt` specifies the system prompt that introduces the relevant context returned from the Qdrant vector search. - -There are a few optional `--qdrant-*` arguments you could use. - -* The `--qdrant-url` is the API URL to the Qdrant server that contains the vector collection. It defaults to `http://localhost:6333`. -* The `--qdrant-collection-name` is the name of the vector collection that contains our knowledge base. It defaults to `default`. -* The `--qdrant-limit` is the maximum number of text chunks (search results) we can add to the prompt as the RAG context. It defaults to `3`. -* The `--qdrant-score-threshold` is the minimum score a search result must reach for its corresponding text chunk to be added to the RAG context. It defaults to `0.4`. - -## Chat with supplemental knowledge - -Just go to `http://localhost:8080/` from your web browser, and you will see a chatbot UI web page. You can now -ask any question about Paris and it will answer based on the Paris guidebook in the Qdrant database! - -> This is a local web server serving a local LLM with contextual knowledge from a local vector database. Nothing leaves your computer! - -Or, you can access it via the API. - -``` -curl -X POST http://localhost:8080/v1/chat/completions \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "Where is Paris?"}]}' - -{ - "id":"18511d0f-b760-437f-a87f-8e95645822a0", - "object":"chat.completion", - "created":1711519741, - "model":"Meta-Llama-3-8B-Instruct-Q5_K_M", - "choices":[{"index":0, - "message":{"role":"assistant","content":"Based on the provided context, Paris is located in the north-central part of France, situated along the Seine River. According to the text, people were living on the site of the present-day city by around 7600 BCE, and the modern city has spread from the island (the Île de la Cité) and far beyond both banks of the Seine."}, - "finish_reason":"stop"}],"usage":{"prompt_tokens":387,"completion_tokens":80,"total_tokens":467} -} -```
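-
-You could also request a streaming response. A minimal variation, assuming this server honors the standard OpenAI `stream` flag:
-
-```
-# assumption: the server supports the standard OpenAI stream flag
-curl -X POST http://localhost:8080/v1/chat/completions \
-  -H 'Content-Type: application/json' \
-  -d '{"stream": true, "messages":[{"role":"user", "content": "Where is Paris?"}]}'
-```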
- -## Next steps - -Now it is time to build your own LLM API server with long-term memory! You can start by using the same embedding model but with a different document. - -Good luck! diff --git a/docs/user-guide/llm/server-side-rag/rag-service.md b/docs/user-guide/llm/server-side-rag/rag-service.md deleted file mode 100644 index 9eae5e7..0000000 --- a/docs/user-guide/llm/server-side-rag/rag-service.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -sidebar_position: 4 ---- - -# Use the API server - -The LlamaEdge RAG API server provides an API endpoint `/create/rag` that takes a text file, segments it into small chunks, turns the chunks into embeddings (i.e., vectors), and then stores the embeddings into the Qdrant database. -It provides an easy way to quickly generate embeddings from a body of text into a Qdrant database collection. - -## Prerequisites - -You will need to follow [this guide](quick-start) to start a Qdrant database instance and a local `rag-api-server.wasm` server. - -Delete the `default` collection if it exists. - -``` -curl -X DELETE 'http://localhost:6333/collections/default' -``` - -## Step by step example - -In this example, we will use a text document `paris.txt`, and simply submit it to the LlamaEdge API server. - -``` -curl -LO https://huggingface.co/datasets/gaianet/paris/raw/main/paris.txt - -curl -X POST http://127.0.0.1:8080/v1/create/rag -F "file=@paris.txt" -``` - -Now, the Qdrant database has a vector collection called `default` which contains embeddings from the Paris guide. You can see the stats of the vector collection as follows. - -``` -curl 'http://localhost:6333/collections/default' -``` - -Of course, the `/create/rag` API is rather primitive in chunking documents and creating embeddings. For many use cases, you should [create your own embedding vectors](text). - -> The `/create/rag` endpoint is a simple combination of [several more basic API endpoints](../../../developer-guide/create-embeddings-collection.md) provided by the API server. You can learn more about them in the developer guide. - -Have fun! diff --git a/docs/user-guide/llm/server-side-rag/text.md b/docs/user-guide/llm/server-side-rag/text.md deleted file mode 100644 index 6a8838c..0000000 --- a/docs/user-guide/llm/server-side-rag/text.md +++ /dev/null @@ -1,120 +0,0 @@ ---- -sidebar_position: 2 ---- - -# Knowledge base from a plain text file - -In this section, we will discuss how to create a vector collection snapshot from a plain text file. The -snapshot file can then be loaded by a GaiaNet node as its knowledge base. - -The text file is segmented into multiple chunks by blank lines. Each chunk is turned into a vector, and when -retrieved, added to the prompt context for the LLM. - -## Prerequisites - -Install the WasmEdge Runtime, the cross-platform LLM runtime. - -``` -curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- --plugins wasi_nn-ggml -``` - -Download an embedding model. - -``` -curl -LO https://huggingface.co/gaianet/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/all-MiniLM-L6-v2-ggml-model-f16.gguf -``` - -The embedding model is a special kind of LLM that turns sentences into vectors.
The vectors can then be stored in a vector database and searched later. When the sentences are from a body of text that represents a knowledge domain, that vector database becomes our RAG knowledge base. - -## Start a vector database - -By default, we use Qdrant as the vector database. You can start a Qdrant instance on your server using Docker. The following command starts it in the background. - -``` -mkdir qdrant_storage -mkdir qdrant_snapshots - -nohup docker run -d -p 6333:6333 -p 6334:6334 \ - -v $(pwd)/qdrant_storage:/qdrant/storage:z \ - -v $(pwd)/qdrant_snapshots:/qdrant/snapshots:z \ - qdrant/qdrant -``` - -## Create the vector collection snapshot - -Delete the default collection if it exists. - -``` -curl -X DELETE 'http://localhost:6333/collections/default' -``` - -Create a new collection called default. Notice that it has 384 dimensions. That is the output vector size of the embedding model `all-MiniLM-L6-v2`. If you are using a different embedding model, you should use a dimension that fits the model. - -``` -curl -X PUT 'http://localhost:6333/collections/default' \ - -H 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "size": 384, - "distance": "Cosine", - "on_disk": true - } - }' -``` - -Download a program to chunk a document and create embeddings. - -``` -curl -LO https://github.com/GaiaNet-AI/embedding-tools/raw/main/paragraph_embed/paragraph_embed.wasm -``` - -It chunks the document based on empty lines. So, you MUST prepare your source document this way -- segment the document into sections of around 200 words with empty lines. You can check out the [Rust source code here](https://github.com/GaiaNet-AI/embedding-tools/tree/main/paragraph_embed) and modify it if you need to use a different chunking strategy. - -> The `paragraph_embed.wasm` program would NOT break up code listings even if there are empty lines within the listing. - -Next, you can run the program by passing a collection name, vector dimension, and the source document. Make sure that Qdrant is running on your local machine. The model is preloaded under the name embedding. The wasm app then uses the embedding model to create the 384-dimension vectors from `paris_chunks.txt` and saves them into the default collection. - -``` -curl -LO https://huggingface.co/datasets/gaianet/paris/raw/main/paris_chunks.txt - -wasmedge --dir .:. \ - --nn-preload embedding:GGML:AUTO:all-MiniLM-L6-v2-ggml-model-f16.gguf \ - paragraph_embed.wasm embedding default 384 paris_chunks.txt -``` - -## More options - -You can also pass the following options to the program. - -* Using `-m` or `--maximum_context_length` to specify a maximum context length on the command line. Each text segment that exceeds it is truncated with a warning. -* Using `-s` or `--start_vector_id` to specify the start vector ID on the command line. This allows us to run the app multiple times, on multiple documents, against the same vector collection. -* Using `-c` or `--ctx_size` to specify the context size of the input. This defaults to 512. - -Example: use the `nomic-embed-text-v1.5.f16` model, which has a context length of 8192 and vector size of 768, to create embeddings for long paragraphs of text. Note that your `default` vector collection must be set up to be 768 dimensions.
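-
-If you created the `default` collection at 384 dimensions earlier, you could drop and recreate it at 768 dimensions first. A minimal sketch, mirroring the DELETE and PUT requests above:
-
-```
-# recreate the default collection at 768 dimensions for nomic-embed-text-v1.5
-curl -X DELETE 'http://localhost:6333/collections/default'
-
-curl -X PUT 'http://localhost:6333/collections/default' \
-  -H 'Content-Type: application/json' \
-  --data-raw '{ "vectors": { "size": 768, "distance": "Cosine", "on_disk": true } }'
-```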
- -``` -curl -LO https://huggingface.co/gaianet/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf - -wasmedge --dir .:. \ - --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5.f16.gguf \ - paragraph_embed.wasm embedding default 768 paris.txt -c 8192 -``` - -Example: the same command as above, but appending the London guide to the end of an existing collection, starting from index 42. - -``` -wasmedge --dir .:. \ - --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5.f16.gguf \ - paragraph_embed.wasm embedding default 768 london.txt -c 8192 -s 42 -``` - - -## Create a vector snapshot - -You can create a snapshot of the collection, which can be shared and loaded into a different Qdrant database. You can find the snapshot file in the `qdrant_snapshots` directory. - -``` -curl -X POST 'http://localhost:6333/collections/default/snapshots' -``` - -Have fun! From 8c21eaebe906676527f67cf0b8f8eee1fbbb1ee9 Mon Sep 17 00:00:00 2001 From: Michael Yuan Date: Wed, 9 Jul 2025 07:09:29 +0000 Subject: [PATCH 2/2] Fix links Signed-off-by: Michael Yuan --- docs/ai-models/index.md | 4 ++-- docs/ai-models/llamaedge-docker.md | 2 +- docs/ai-models/llm/tool-call.md | 16 +++++++--------- docs/llama-nexus/openai-api/intro.md | 26 +++----------------------- docs/llamaedge_vs_ollama.md | 5 ----- 5 files changed, 13 insertions(+), 40 deletions(-) diff --git a/docs/ai-models/index.md b/docs/ai-models/index.md index 53bbb0e..3bcf6e1 100644 --- a/docs/ai-models/index.md +++ b/docs/ai-models/index.md @@ -15,9 +15,9 @@ Explore the LLM capabilities Work with vision-language models like Llava and Qwen-VL ➔ [Get Started with Multimodal](/docs/category/multimodal) -## 👁️ Mu Embeddings +## 👁️ Embeddings Work with embedding models for vector and semantic search -➔ [Get Started with Multimodal](/docs/category/mulembeddings) +➔ [Get Started with Embeddings](/docs/category/embeddings) ## 🎙️ Speech to Text Run speech-to-text models like Whisper diff --git a/docs/ai-models/llamaedge-docker.md b/docs/ai-models/llamaedge-docker.md index a60e48c..b71f514 100644 --- a/docs/ai-models/llamaedge-docker.md +++ b/docs/ai-models/llamaedge-docker.md @@ -96,5 +96,5 @@ docker push secondstate/qwen-2-0.5b-allminilm-2:latest ## What's next -Use the container as a drop-in replacement for the OpenAI API for your favorite agent app or framework! [See some examples here](openai-api/intro.md). +Use the container as a drop-in replacement for the OpenAI API for your favorite agent app or framework! [See some examples here](../llama-nexus/openai-api/intro.md). diff --git a/docs/ai-models/llm/tool-call.md b/docs/ai-models/llm/tool-call.md index 8296024..08ddcf4 100644 --- a/docs/ai-models/llm/tool-call.md +++ b/docs/ai-models/llm/tool-call.md @@ -14,7 +14,7 @@ In this tutorial, we will show you a simple Python program that allows a local L ## Prerequisites -Follow [this guide](/docs/user-guide/openai-api/intro.md) to start a LlamaEdge API server. +Follow [this guide](quick-start-llm.md) to start a LlamaEdge API server. For this tutorial, we will need an open source model that is capable of tool calling. The Llama 3.1 8B model is a good choice. Let's download the model file. @@ -27,14 +27,12 @@ Then start the LlamaEdge API server for this model as follows. ``` wasmedge --dir .:. \ --nn-preload default:GGML:AUTO:Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf \ - --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5.f16.gguf \ llama-api-server.wasm \ - --model-alias default,embedding \ - --model-name Meta-Llama-3.1-8B-Instruct-Q5_K_M,nomic-embed \ - --prompt-template llama-3-tool,embedding \ - --batch-size 128,8192 \ - --ubatch-size 128,8192 \ - --ctx-size 8192,8192 + --model-name Meta-Llama-3.1-8B-Instruct-Q5_K_M \ + --prompt-template llama-3-tool \ + --batch-size 128 \ + --ubatch-size 128 \ + --ctx-size 8192 ``` Note the `llama-3-tool` prompt template. It constructs user queries and LLM responses, including JSON messages for tool calls, into proper formats that the model is finetuned to follow.
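+
+Once the server is running, you could sanity-check the setup by sending a tool-call request directly. Below is a minimal sketch with a hypothetical `multiply` function; a model finetuned for tool calls would typically answer with a `tool_calls` message instead of plain text.
+
+```
+# sketch only: 'multiply' is a hypothetical tool for illustration
+curl -X POST http://127.0.0.1:8080/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "Meta-Llama-3.1-8B-Instruct-Q5_K_M",
+       "messages": [{"role": "user", "content": "What is 1234 times 5678?"}],
+       "tools": [{"type": "function", "function": {
+         "name": "multiply",
+         "description": "Multiply two integers",
+         "parameters": {"type": "object",
+           "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
+           "required": ["a", "b"]}}}]}'
+```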
@@ -56,7 +54,7 @@ pip install -r requirements.txt Set the environment variables for the API server and model name we just set up. ``` -export OPENAI_MODEL_NAME="llama-3-groq-8b" +export OPENAI_MODEL_NAME="Meta-Llama-3.1-8B-Instruct-Q5_K_M" export OPENAI_BASE_URL="http://127.0.0.1:8080/v1" ``` diff --git a/docs/llama-nexus/openai-api/intro.md b/docs/llama-nexus/openai-api/intro.md index 6474ef9..ab053f4 100644 --- a/docs/llama-nexus/openai-api/intro.md +++ b/docs/llama-nexus/openai-api/intro.md @@ -2,35 +2,15 @@ sidebar_position: 1 --- -# Start a LlamaEdge API service +# Start the llama-nexus API service Since LlamaEdge provides an OpenAI-compatible API service, it can be a drop-in replacement for OpenAI in almost all LLM applications and frameworks. -Check out the articles in this section for instructions and examples for how to use locally hosted LlamaEdge API services in popular LLM apps. - -## Start the API servers for multiple models - -First, you will need to start an OpenAI compatible API server. - -* Start an OpenAI compatible API server for Large Language Models (LLM) -➔ [Get Started with LLM](/docs/category/llm) - - -* Start an OpenAI compatible API server for Whisper -➔ [Get Started with Speech to Text](/docs/category/speech-to-text) - -* Start an OpenAI compatible API server for GPT-SoVITS and Piper -➔ [Get Started with Text to Speech](/docs/category/text-to-speech) - -* Start an OpenAI compatible API server for Stable Diffusion and FLUX -➔ [Get Started with Text-to-Image](/docs/category/text-to-image) - -* Start an OpenAI compatible API server for Llava and Qwen-VL -➔ [Get Started with Multimodal](/docs/category/multimodal) +You can start LlamaEdge API servers for individual AI models, and use llama-nexus to combine multiple AI models +into a single API server. ## OpenAI replacement -Now, you are ready to use this API server in OpenAI ecosystem apps as a drop-in replacement for the OpenAI API! In general, for any OpenAI tool, you could just replace the following. |Config option | Value | Note | diff --git a/docs/llamaedge_vs_ollama.md b/docs/llamaedge_vs_ollama.md index a23dc8f..2a8cc17 100644 --- a/docs/llamaedge_vs_ollama.md +++ b/docs/llamaedge_vs_ollama.md @@ -18,8 +18,3 @@ choose LlamaEdge over them? Finally, LlamaEdge is a developer platform. It provides Rust APIs and components for you to build your own applications. It enables developers to create a single compact and cross-platform binary app that can be easily deployed and orchestrated across clouds. -* The [server-side RAG](user-guide/llm/server-side-rag/quick-start) API server is built on LlamaEdge components. -* The [moxin](https://github.com/project-robius/moxin) LLM client app uses LlamaEdge as the embedded inference engine.
-* The [GaiaNet](https://github.com/GaiaNet-AI/gaianet-node) project embeds LlamaEdge to run a large number of decentralized LLM agents across the web. -* The [Terminus OS](https://www.jointerminus.com/) project is a Kubernetes-based personal OS. It embeds LlamaEdge to power AI services such as local search and document QA. -