diff --git a/community/vgpu-sizing-advisor/.dockerignore b/community/ai-vws-sizing-advisor/.dockerignore
similarity index 100%
rename from community/vgpu-sizing-advisor/.dockerignore
rename to community/ai-vws-sizing-advisor/.dockerignore
diff --git a/community/vgpu-sizing-advisor/.gitattributes b/community/ai-vws-sizing-advisor/.gitattributes
similarity index 100%
rename from community/vgpu-sizing-advisor/.gitattributes
rename to community/ai-vws-sizing-advisor/.gitattributes
diff --git a/community/vgpu-sizing-advisor/.gitignore b/community/ai-vws-sizing-advisor/.gitignore
similarity index 100%
rename from community/vgpu-sizing-advisor/.gitignore
rename to community/ai-vws-sizing-advisor/.gitignore
diff --git a/community/vgpu-sizing-advisor/CHANGELOG.md b/community/ai-vws-sizing-advisor/CHANGELOG.md
similarity index 81%
rename from community/vgpu-sizing-advisor/CHANGELOG.md
rename to community/ai-vws-sizing-advisor/CHANGELOG.md
index 7b465933f..9fae02c9a 100644
--- a/community/vgpu-sizing-advisor/CHANGELOG.md
+++ b/community/ai-vws-sizing-advisor/CHANGELOG.md
@@ -3,7 +3,53 @@ All notable changes to this project will be documented in this file.
The format is based on Keep a Changelog, and this project adheres to Semantic Versioning.
-## [2.3.0] - 2025-10-20
+## [2.3] - 2026-01-08
+
+This release focuses on improved sizing recommendations, enhanced Nemotron model integration, and comprehensive documentation updates.
+
+### Added
+- **Demo Screenshots** — Added visual examples showcasing the Configuration Wizard, RAG-powered sizing recommendations, and Local Deployment verification
+- **Official Documentation Link** — Added link to [NVIDIA vGPU Docs Hub](https://docs.nvidia.com/vgpu/toolkits/sizing-advisor/latest/intro.html) in README
+
+### Changed
+- **README Overhaul** — Reorganized documentation to highlight NVIDIA Nemotron models
+ - Llama-3.3-Nemotron-Super-49B powers the RAG backend
+ - Nemotron-3 Nano 30B (FP8) as default for workload sizing
+ - New Demo section with screenshots demonstrating key features
+
+- **Sizing Recommendation Improvements**
+ - Enhanced 95% usable capacity rule for profile selection (5% reserved for system overhead)
+ - Improved profile selection logic: picks smallest profile where (profile × 0.95) >= workload
+ - Better handling of edge cases near profile boundaries
+
+- **GPU Passthrough Logic**
+ - Automatic passthrough recommendation when workload exceeds max single vGPU profile
+ - Clearer passthrough examples in RAG context (e.g., 92GB on BSE → 2× BSE GPU passthrough)
+ - Calculator now returns `vgpu_profile: null` with multi-GPU passthrough recommendation
+
+- **vLLM Local Deployment**
+ - Updated to vLLM v0.12.0 for proper NemotronH (hybrid Mamba-Transformer) architecture support
+ - Improved GPU memory utilization calculations for local testing
+ - Better max-model-len auto-detection (only set when explicitly specified)
+
+- **Chat Improvements**
+ - Enhanced conversational mode with vGPU configuration context
+ - Better model extraction from sizing responses for follow-up questions
+ - Improved context handling for RAG vs inference workload discussions
+
+### Improved
+- **Nemotron Model Integration**
+ - Default model changed to Nemotron-3 Nano 30B FP8 in configuration wizard
+ - Nemotron thinking prompt support for enhanced reasoning
+ - Better model matching for Nemotron variants in calculator
+
+## [2.2] - 2025-11-04
+
+### Changed
+- Updated branding from "vGPU Sizing Advisor" to "AI vWS Sizing Advisor" throughout UI and documentation
+- Improved user-facing verbiage for better clarity and consistency
+
+## [2.1] - 2025-10-20
This release focuses on local deployment improvements, enhanced workload differentiation, and improved user experience with advanced configuration options.
@@ -52,7 +98,7 @@ This release focuses on local deployment improvements, enhanced workload differe
- Better visual feedback and status indicators
- Improved configuration wizard flow
-## [2.2.0] - 2025-10-13
+## [2.0] - 2025-10-13
This release focuses on the AI vWS Sizing Advisor with enhanced deployment capabilities, improved user experience, and zero external dependencies for SSH operations.
@@ -137,8 +183,7 @@ This release focuses on the AI vWS Sizing Advisor with enhanced deployment capab
- SSH key-based authentication (more secure than passwords)
- Automatic key generation with proper permissions (700/600)
-## [2.1.0] - 2025-05-13
-
+## [1.2] - 2025-05-13
This release reduces overall GPU requirement for the deployment of the blueprint. It also improves the performance and stability for both docker and helm based deployments.
@@ -168,7 +213,7 @@ This release reduces overall GPU requirement for the deployment of the blueprint
A detailed guide is available [here](./docs/migration_guide.md) for easing developers experience, while migrating from older versions.
-## [2.0.0] - 2025-03-18
+## [1.1] - 2025-03-18
This release adds support for multimodal documents using [Nvidia Ingest](https://github.com/NVIDIA/nv-ingest) including support for parsing PDFs, Word and PowerPoint documents. It also significantly improves accuracy and perf considerations by refactoring the APIs, architecture as well as adds a new developer friendly UI.
@@ -202,7 +247,7 @@ This release adds support for multimodal documents using [Nvidia Ingest](https:/
A detailed guide is available [here](./docs/migration_guide.md) for easing developers experience, while migrating from older versions.
-## [1.0.0] - 2025-01-15
+## [1.0] - 2025-01-15
### Added
diff --git a/community/vgpu-sizing-advisor/README.md b/community/ai-vws-sizing-advisor/README.md
similarity index 56%
rename from community/vgpu-sizing-advisor/README.md
rename to community/ai-vws-sizing-advisor/README.md
index 3af5ea429..b12830350 100644
--- a/community/vgpu-sizing-advisor/README.md
+++ b/community/ai-vws-sizing-advisor/README.md
@@ -1,18 +1,67 @@
-# vGPU Sizing Advisor for AI vWS
+# AI vWS Sizing Advisor
+
+
+
+
+
+
+ RAG-powered vGPU sizing recommendations for AI Virtual Workstations
+ Powered by NVIDIA NeMo™ and Nemotron models
+
+
+---
## Overview
-vGPU Sizing Advisor is a RAG-powered tool that helps you determine the optimal NVIDIA vGPU configuration for AI workloads on NVIDIA AI Virtual Workstation (AI vWS). Using NVIDIA vGPU documentation and best practices, it provides tailored recommendations for optimal performance and resource efficiency.
+AI vWS Sizing Advisor is a RAG-powered tool that helps you determine the optimal NVIDIA vGPU sizing configuration for AI workloads on NVIDIA AI Virtual Workstation (AI vWS). Using NVIDIA vGPU documentation and best practices, it provides tailored recommendations for optimal performance and resource efficiency.
+
+### Powered by NVIDIA Nemotron
+
+This tool leverages **NVIDIA Nemotron models** for intelligent sizing recommendations:
+
+- **[Llama-3.3-Nemotron-Super-49B](https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1)** — Powers the RAG backend for intelligent conversational sizing guidance
+- **[Nemotron-3 Nano 30B](https://build.nvidia.com/nvidia/nvidia-nemotron-3-nano-30b-a3b-fp8)** — Default model for workload sizing calculations (FP8 optimized)
+
+### Key Capabilities
Enter your workload requirements and receive validated recommendations including:
-- **vGPU Profile** - Recommended profile (e.g., L40S-24Q) based on your workload
-- **Resource Requirements** - vCPUs, GPU memory, system RAM needed
-- **Performance Estimates** - Expected latency, throughput, and time to first token
-- **Live Testing** - Instantly deploy and validate your configuration locally using vLLM containers
+- **vGPU Profile** — Recommended profile (e.g., L40S-24Q) based on your workload
+- **Resource Requirements** — vCPUs, GPU memory, system RAM needed
+- **Performance Estimates** — Expected latency, throughput, and time to first token
+- **Live Testing** — Instantly deploy and validate your configuration locally using vLLM containers
The tool differentiates between RAG and inference workloads by accounting for embedding vectors and database overhead. It intelligently suggests GPU passthrough when jobs exceed standard vGPU profile limits.
+---
+
+## Demo
+
+### Configuration Wizard
+
+Configure your workload parameters including model selection, GPU type, quantization, and token sizes:
+
+
+
+
+
+### Local Deployment Verification
+
+Validate your configuration by deploying a vLLM container locally and comparing actual GPU memory usage against estimates:
+
+
+
+
+
+---
+
## Prerequisites
### Hardware
@@ -44,15 +93,17 @@ docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
> **Note:** Docker must be at `/usr/bin/docker` (verified in `deploy/compose/docker-compose-rag-server.yaml`). User must be in docker group or have socket permissions.
### API Keys
-- **NVIDIA Build API Key** (Required) - [Get your key](https://build.nvidia.com/settings/api-keys)
-- **HuggingFace Token** (Optional) - [Create token](https://huggingface.co/settings/tokens) for gated models
+- **NVIDIA Build API Key** (Required) — [Get your key](https://build.nvidia.com/settings/api-keys)
+- **HuggingFace Token** (Optional) — [Create token](https://huggingface.co/settings/tokens) for gated models
+
+---
## Deployment
**1. Clone and navigate:**
```bash
git clone https://github.com/NVIDIA/GenerativeAIExamples.git
-cd GenerativeAIExamples/community/vgpu-sizing-advisor
+cd GenerativeAIExamples/community/ai-vws-sizing-advisor
```
**2. Set NGC API key:**
@@ -74,28 +125,32 @@ npm install
npm run dev
```
+---
+
## Usage
-2. **Select Workload Type:** RAG or Inference
+1. **Select Workload Type:** RAG or Inference
-3. **Enter Parameters:**
- - Model name (e.g., `meta-llama/Llama-2-7b-chat-hf`)
+2. **Enter Parameters:**
+ - Model name (default: **Nemotron-3 Nano 30B FP8**)
- GPU type
- Prompt size (input tokens)
- Response size (output tokens)
- - Quantization (FP16, INT8, INT4)
+ - Quantization (FP16, FP8, INT8, INT4)
- For RAG: Embedding model and vector dimensions
-4. **View Recommendations:**
+3. **View Recommendations:**
- Recommended vGPU profiles
- Resource requirements (vCPUs, RAM, GPU memory)
- Performance estimates
-5. **Test Locally** (optional):
+4. **Test Locally** (optional):
- Run local inference with a containerized vLLM server
- View performance metrics
- Compare actual results versus suggested profile configuration
+---
+
## Management Commands
```bash
@@ -120,6 +175,8 @@ The stop script automatically performs Docker cleanup operations:
- Optionally removes dangling images (`--cleanup-images`)
- Optionally removes all data volumes (`--volumes`)
+---
+
## Adding Documents to RAG Context
The tool includes NVIDIA vGPU documentation by default. To add your own:
@@ -134,8 +191,7 @@ curl -X POST -F "file=@./vgpu_docs/your-document.pdf" http://localhost:8082/v1/i
**Supported formats:** PDF, TXT, DOCX, HTML, PPTX
-
-
+---
## License
@@ -145,6 +201,6 @@ Models governed by [NVIDIA AI Foundation Models Community License](https://docs.
---
-**Version:** 2.3.0 (October 2025) - See [CHANGELOG.md](./CHANGELOG.md)
+**Version:** 2.3 (January 2026) — See [CHANGELOG.md](./CHANGELOG.md)
-**Support:** [GitHub Issues](https://github.com/NVIDIA/GenerativeAIExamples/issues) | [NVIDIA Forums](https://forums.developer.nvidia.com/)
\ No newline at end of file
+**Support:** [GitHub Issues](https://github.com/NVIDIA/GenerativeAIExamples/issues) | [NVIDIA Forums](https://forums.developer.nvidia.com/) | [Official Docs](https://docs.nvidia.com/vgpu/toolkits/sizing-advisor/latest/intro.html)
\ No newline at end of file
diff --git a/community/vgpu-sizing-advisor/deploy/compose/.env b/community/ai-vws-sizing-advisor/deploy/compose/.env
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/.env
rename to community/ai-vws-sizing-advisor/deploy/compose/.env
diff --git a/community/vgpu-sizing-advisor/deploy/compose/accuracy_profile.env b/community/ai-vws-sizing-advisor/deploy/compose/accuracy_profile.env
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/accuracy_profile.env
rename to community/ai-vws-sizing-advisor/deploy/compose/accuracy_profile.env
diff --git a/community/vgpu-sizing-advisor/deploy/compose/docker-compose-bootstrap.yaml b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-bootstrap.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/docker-compose-bootstrap.yaml
rename to community/ai-vws-sizing-advisor/deploy/compose/docker-compose-bootstrap.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml
similarity index 88%
rename from community/vgpu-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml
rename to community/ai-vws-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml
index 24eb910ba..a163c3680 100644
--- a/community/vgpu-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml
+++ b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml
@@ -1,3 +1,11 @@
+# ============================================================================
+# CENTRALIZED MODEL CONFIGURATION
+# Change these values to use different models throughout the application
+# ============================================================================
+x-model-config:
+ # Embedding Model Configuration
+ embedding-model: &embedding-model "nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1"
+
services:
# Main ingestor server which is responsible for ingestion
@@ -38,10 +46,14 @@ services:
NGC_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"}
##===Embedding Model specific configurations===
+      # Model name - set by the x-model-config anchor at the top of this file; edit the anchor to change it (YAML anchors are fixed at parse time, so an env var does NOT override this value)
+ APP_EMBEDDINGS_MODELNAME: *embedding-model
# url on which embedding model is hosted. If "", Nvidia hosted API is used
- APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"}
- APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-mistral-7b-v2}
- APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048}
+ APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-"nemoretriever-embedding-ms:8000"}
+ # Embedding dimensions - IMPORTANT: Must match your embedding model!
+ # nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1: 4096
+ # nvidia/nv-embedqa-mistral-7b-v2: 2048
+ APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-4096}
##===NV-Ingest Connection Configurations=======
APP_NVINGEST_MESSAGECLIENTHOSTNAME: ${APP_NVINGEST_MESSAGECLIENTHOSTNAME:-"nv-ingest-ms-runtime"}
@@ -115,9 +127,10 @@ services:
- AUDIO_INFER_PROTOCOL=grpc
- CUDA_VISIBLE_DEVICES=0
- MAX_INGEST_PROCESS_WORKERS=${MAX_INGEST_PROCESS_WORKERS:-16}
- - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-7b-v2}}
+      # Embedding model - resolves from the APP_EMBEDDINGS_MODELNAME variable in the *host* shell environment if set, otherwise the hardcoded default below (the x-model-config anchor does not substitute into ${...} expressions)
+ - EMBEDDING_NIM_MODEL_NAME=${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}
# Incase of self-hosted embedding model, use the endpoint url as - https://integrate.api.nvidia.com/v1
- - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-embedding-ms:8000/v1}}
+ - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-http://nemoretriever-embedding-ms:8000/v1}
- INGEST_LOG_LEVEL=DEFAULT
- INGEST_EDGE_BUFFER_SIZE=64
# Message client for development
diff --git a/community/vgpu-sizing-advisor/deploy/compose/docker-compose-nemo-guardrails.yaml b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-nemo-guardrails.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/docker-compose-nemo-guardrails.yaml
rename to community/ai-vws-sizing-advisor/deploy/compose/docker-compose-nemo-guardrails.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml
similarity index 82%
rename from community/vgpu-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml
rename to community/ai-vws-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml
index 7beba493d..69bfdd194 100644
--- a/community/vgpu-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml
+++ b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml
@@ -1,3 +1,14 @@
+# ============================================================================
+# CENTRALIZED MODEL CONFIGURATION
+# Change these values to use different models throughout the application
+# ============================================================================
+x-model-config:
+ # Chat/LLM Model Configuration
+ llm-model: &llm-model "nvidia/llama-3.3-nemotron-super-49b-v1"
+
+ # Embedding Model Configuration
+ embedding-model: &embedding-model "nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1"
+
services:
# Main orchestrator server which stiches together all calls to different services to fulfill the user request
@@ -35,25 +46,16 @@ services:
VECTOR_DB_TOPK: ${VECTOR_DB_TOPK:-100}
##===LLM Model specific configurations===
- APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"meta/llama-3.1-8b-instruct"}
+      # Model name - set by the x-model-config anchor at the top of this file; edit the anchor to change it (YAML anchors are fixed at parse time, so an env var does NOT override this value)
+ APP_LLM_MODELNAME: *llm-model
# url on which llm model is hosted. If "", Nvidia hosted API is used
- APP_LLM_SERVERURL: ${APP_LLM_SERVERURL-""}
-
- ##===Query Rewriter Model specific configurations===
- APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"meta/llama-3.1-8b-instruct"}
- # url on which query rewriter model is hosted. If "", Nvidia hosted API is used
- APP_QUERYREWRITER_SERVERURL: ${APP_QUERYREWRITER_SERVERURL-"nim-llm-llama-8b-ms:8000"}
+ APP_LLM_SERVERURL: ${APP_LLM_SERVERURL:-""}
##===Embedding Model specific configurations===
+      # Model name - set by the x-model-config anchor at the top of this file; edit the anchor to change it (YAML anchors are fixed at parse time, so an env var does NOT override this value)
+ APP_EMBEDDINGS_MODELNAME: *embedding-model
# url on which embedding model is hosted. If "", Nvidia hosted API is used
- APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-""}
- APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-mistral-7b-v2}
-
- ##===Reranking Model specific configurations===
- # url on which ranking model is hosted. If "", Nvidia hosted API is used
- APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-""}
- APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-nv-rerank-qa-mistral-4b:1}
- ENABLE_RERANKER: ${ENABLE_RERANKER:-True}
+ APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-""}
NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"}
@@ -65,7 +67,7 @@ services:
# enable multi-turn conversation in the rag chain - this controls conversation history usage
# while doing query rewriting and in LLM prompt
- ENABLE_MULTITURN: ${ENABLE_MULTITURN:-False}
+ ENABLE_MULTITURN: ${ENABLE_MULTITURN:-True}
# enable query rewriting for multiturn conversation in the rag chain.
# This will improve accuracy of the retrieiver pipeline but increase latency due to an additional LLM call
@@ -139,10 +141,10 @@ services:
context: ../../frontend
dockerfile: ./Dockerfile
args:
- # Model name for LLM
- NEXT_PUBLIC_MODEL_NAME: ${APP_LLM_MODELNAME:-meta/llama-3.1-8b-instruct}
- # Model name for embeddings
- NEXT_PUBLIC_EMBEDDING_MODEL: ${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-mistral-7b-v2}
+ # Model name for LLM - pulls from centralized config at top of file
+ NEXT_PUBLIC_MODEL_NAME: *llm-model
+ # Model name for embeddings - pulls from centralized config at top of file
+ NEXT_PUBLIC_EMBEDDING_MODEL: *embedding-model
# Model name for reranking
NEXT_PUBLIC_RERANKER_MODEL: ${APP_RANKING_MODELNAME:-nv-rerank-qa-mistral-4b:1}
# URL for rag server container
diff --git a/community/ai-vws-sizing-advisor/deploy/compose/model_config.env b/community/ai-vws-sizing-advisor/deploy/compose/model_config.env
new file mode 100644
index 000000000..fa46dc00a
--- /dev/null
+++ b/community/ai-vws-sizing-advisor/deploy/compose/model_config.env
@@ -0,0 +1,82 @@
+# ============================================================================
+# CENTRALIZED MODEL CONFIGURATION
+# ============================================================================
+# This file centralizes all model configurations for the RAG system.
+# Source this file to set these variables; NOTE: the compose files now pin the LLM and embedding model names via x-model-config YAML anchors, so the APP_*_MODELNAME exports below do not override them - edit the anchors instead.
+#
+# Usage:
+# source model_config.env
+# docker compose -f docker-compose-rag-server.yaml up
+#
+# ============================================================================
+
+# ----------------------------------------------------------------------------
+# CHAT/LLM MODEL CONFIGURATION
+# ----------------------------------------------------------------------------
+# The main language model used for generating responses
+# Default: nvidia/llama-3.3-nemotron-super-49b-v1
+#
+# Other options:
+# - meta/llama-3.1-405b-instruct
+# - meta/llama-3.1-70b-instruct
+# - meta/llama-3.1-8b-instruct
+# - mistralai/mixtral-8x22b-instruct-v0.1
+#
+export APP_LLM_MODELNAME="nvidia/llama-3.3-nemotron-super-49b-v1"
+
+# LLM Server URL (leave empty "" to use NVIDIA hosted API)
+export APP_LLM_SERVERURL=""
+
+# ----------------------------------------------------------------------------
+# EMBEDDING MODEL CONFIGURATION
+# ----------------------------------------------------------------------------
+# The embedding model used for vectorizing documents and queries
+# Default: nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1
+#
+# Other options:
+# - nvidia/nv-embedqa-mistral-7b-v2
+# - nvidia/nv-embed-v2
+# - nvidia/llama-3.2-nv-embedqa-1b-v2
+#
+export APP_EMBEDDINGS_MODELNAME="nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1"
+
+# Embedding Server URL (leave empty "" to use NVIDIA hosted API, or set to self-hosted)
+# Example for self-hosted: "nemoretriever-embedding-ms:8000"
+export APP_EMBEDDINGS_SERVERURL=""
+
+# Embedding dimensions (adjust based on your embedding model)
+# IMPORTANT: This MUST match your chosen embedding model!
+# - nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1: 4096 (current default)
+# - nvidia/nv-embedqa-mistral-7b-v2: 2048
+# - nvidia/nv-embed-v2: 4096
+export APP_EMBEDDINGS_DIMENSIONS="4096"
+
+# ----------------------------------------------------------------------------
+# REFLECTION MODEL CONFIGURATION (for response quality checking)
+# ----------------------------------------------------------------------------
+# Model used for reflection/self-checking if ENABLE_REFLECTION=true
+export REFLECTION_LLM="mistralai/mixtral-8x22b-instruct-v0.1"
+export REFLECTION_LLM_SERVERURL="nim-llm-mixtral-8x22b:8000"
+
+# ----------------------------------------------------------------------------
+# CAPTION MODEL CONFIGURATION (for image/chart understanding)
+# ----------------------------------------------------------------------------
+# Model used for generating captions for images, charts, and tables
+export APP_NVINGEST_CAPTIONMODELNAME="meta/llama-3.2-11b-vision-instruct"
+export APP_NVINGEST_CAPTIONENDPOINTURL="http://vlm-ms:8000/v1/chat/completions"
+export VLM_CAPTION_MODEL_NAME="meta/llama-3.2-11b-vision-instruct"
+export VLM_CAPTION_ENDPOINT="http://vlm-ms:8000/v1/chat/completions"
+
+# ----------------------------------------------------------------------------
+# ADDITIONAL NOTES
+# ----------------------------------------------------------------------------
+# 1. After changing models, you may need to rebuild containers:
+# docker compose -f docker-compose-rag-server.yaml build --no-cache rag-playground
+#
+# 2. For self-hosted models, make sure the corresponding NIM services are running
+#
+# 3. The embedding dimensions must match your chosen embedding model
+#
+# 4. When switching between hosted and self-hosted, update both the model name
+# and the server URL accordingly
+
diff --git a/community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/config.yaml b/community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/config.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/config.yaml
rename to community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/config.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard/config.yml b/community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard/config.yml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard/config.yml
rename to community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard/config.yml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard/prompts.yml b/community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard/prompts.yml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard/prompts.yml
rename to community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard/prompts.yml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml b/community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml
rename to community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/config.yml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/prompts.yml b/community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/prompts.yml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/prompts.yml
rename to community/ai-vws-sizing-advisor/deploy/compose/nemoguardrails/config-store/nemoguard_cloud/prompts.yml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/nims.yaml b/community/ai-vws-sizing-advisor/deploy/compose/nims.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/nims.yaml
rename to community/ai-vws-sizing-advisor/deploy/compose/nims.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/observability.yaml b/community/ai-vws-sizing-advisor/deploy/compose/observability.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/observability.yaml
rename to community/ai-vws-sizing-advisor/deploy/compose/observability.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/perf_profile.env b/community/ai-vws-sizing-advisor/deploy/compose/perf_profile.env
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/perf_profile.env
rename to community/ai-vws-sizing-advisor/deploy/compose/perf_profile.env
diff --git a/community/vgpu-sizing-advisor/deploy/compose/vectordb.yaml b/community/ai-vws-sizing-advisor/deploy/compose/vectordb.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/vectordb.yaml
rename to community/ai-vws-sizing-advisor/deploy/compose/vectordb.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/compose/vgpu_bootstrap.env b/community/ai-vws-sizing-advisor/deploy/compose/vgpu_bootstrap.env
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/compose/vgpu_bootstrap.env
rename to community/ai-vws-sizing-advisor/deploy/compose/vgpu_bootstrap.env
diff --git a/community/vgpu-sizing-advisor/deploy/config/otel-collector-config.yaml b/community/ai-vws-sizing-advisor/deploy/config/otel-collector-config.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/config/otel-collector-config.yaml
rename to community/ai-vws-sizing-advisor/deploy/config/otel-collector-config.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/config/prometheus.yaml b/community/ai-vws-sizing-advisor/deploy/config/prometheus.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/config/prometheus.yaml
rename to community/ai-vws-sizing-advisor/deploy/config/prometheus.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/config/rag-metrics-dashboard.json b/community/ai-vws-sizing-advisor/deploy/config/rag-metrics-dashboard.json
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/config/rag-metrics-dashboard.json
rename to community/ai-vws-sizing-advisor/deploy/config/rag-metrics-dashboard.json
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/Chart.lock b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/Chart.lock
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/Chart.lock
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/Chart.lock
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/Chart.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/Chart.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/Chart.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/Chart.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/LICENSE b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/LICENSE
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/LICENSE
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/LICENSE
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/.helmignore b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/.helmignore
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/.helmignore
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/.helmignore
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/Chart.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/Chart.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/Chart.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/Chart.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/NOTES.txt b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/NOTES.txt
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/NOTES.txt
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/NOTES.txt
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/_helpers.tpl b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/_helpers.tpl
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/_helpers.tpl
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/_helpers.tpl
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/deployment.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/deployment.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/deployment.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/deployment.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/hpa.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/hpa.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/hpa.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/hpa.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/ingress.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/ingress.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/ingress.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/ingress.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/secrets.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/secrets.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/secrets.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/secrets.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/service.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/service.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/service.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/service.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/serviceaccount.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/serviceaccount.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/serviceaccount.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/templates/serviceaccount.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/values.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/values.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/frontend/values.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/frontend/values.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/Chart.lock b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/Chart.lock
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/Chart.lock
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/Chart.lock
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/Chart.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/Chart.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/Chart.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/Chart.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/_helpers.tpl b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/_helpers.tpl
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/_helpers.tpl
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/_helpers.tpl
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/deployment.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/deployment.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/deployment.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/deployment.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/secrets.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/secrets.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/secrets.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/secrets.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/service.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/service.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/service.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/templates/service.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/values.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/values.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/values.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/charts/ingestor-server/values.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/files/prompt.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/files/prompt.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/files/prompt.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/files/prompt.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/_helpers.tpl b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/_helpers.tpl
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/_helpers.tpl
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/_helpers.tpl
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/configmap.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/configmap.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/configmap.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/configmap.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/deployment.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/deployment.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/deployment.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/deployment.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/secrets.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/secrets.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/secrets.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/secrets.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/service.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/service.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/service.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/service.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/servicemonitor.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/servicemonitor.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/templates/servicemonitor.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/templates/servicemonitor.yaml
diff --git a/community/vgpu-sizing-advisor/deploy/helm/rag-server/values.yaml b/community/ai-vws-sizing-advisor/deploy/helm/rag-server/values.yaml
similarity index 100%
rename from community/vgpu-sizing-advisor/deploy/helm/rag-server/values.yaml
rename to community/ai-vws-sizing-advisor/deploy/helm/rag-server/values.yaml
diff --git a/community/ai-vws-sizing-advisor/deployment_examples/configuration_wizard.png b/community/ai-vws-sizing-advisor/deployment_examples/configuration_wizard.png
new file mode 100644
index 000000000..8734e4abe
Binary files /dev/null and b/community/ai-vws-sizing-advisor/deployment_examples/configuration_wizard.png differ
diff --git a/community/ai-vws-sizing-advisor/deployment_examples/example_rag_config.png b/community/ai-vws-sizing-advisor/deployment_examples/example_rag_config.png
new file mode 100644
index 000000000..0625c77bf
Binary files /dev/null and b/community/ai-vws-sizing-advisor/deployment_examples/example_rag_config.png differ
diff --git a/community/ai-vws-sizing-advisor/deployment_examples/local_deployment.png b/community/ai-vws-sizing-advisor/deployment_examples/local_deployment.png
new file mode 100644
index 000000000..d54b7eff6
Binary files /dev/null and b/community/ai-vws-sizing-advisor/deployment_examples/local_deployment.png differ
diff --git a/community/vgpu-sizing-advisor/frontend/.env.example b/community/ai-vws-sizing-advisor/frontend/.env.example
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/.env.example
rename to community/ai-vws-sizing-advisor/frontend/.env.example
diff --git a/community/vgpu-sizing-advisor/frontend/.gitignore b/community/ai-vws-sizing-advisor/frontend/.gitignore
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/.gitignore
rename to community/ai-vws-sizing-advisor/frontend/.gitignore
diff --git a/community/vgpu-sizing-advisor/frontend/.prettierrc b/community/ai-vws-sizing-advisor/frontend/.prettierrc
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/.prettierrc
rename to community/ai-vws-sizing-advisor/frontend/.prettierrc
diff --git a/community/vgpu-sizing-advisor/frontend/Dockerfile b/community/ai-vws-sizing-advisor/frontend/Dockerfile
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/Dockerfile
rename to community/ai-vws-sizing-advisor/frontend/Dockerfile
diff --git a/community/vgpu-sizing-advisor/frontend/LICENSE-3rd-party.txt b/community/ai-vws-sizing-advisor/frontend/LICENSE-3rd-party.txt
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/LICENSE-3rd-party.txt
rename to community/ai-vws-sizing-advisor/frontend/LICENSE-3rd-party.txt
diff --git a/community/vgpu-sizing-advisor/frontend/eslint.config.mjs b/community/ai-vws-sizing-advisor/frontend/eslint.config.mjs
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/eslint.config.mjs
rename to community/ai-vws-sizing-advisor/frontend/eslint.config.mjs
diff --git a/community/vgpu-sizing-advisor/frontend/next-env.d.ts b/community/ai-vws-sizing-advisor/frontend/next-env.d.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/next-env.d.ts
rename to community/ai-vws-sizing-advisor/frontend/next-env.d.ts
diff --git a/community/vgpu-sizing-advisor/frontend/next.config.ts b/community/ai-vws-sizing-advisor/frontend/next.config.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/next.config.ts
rename to community/ai-vws-sizing-advisor/frontend/next.config.ts
diff --git a/community/vgpu-sizing-advisor/frontend/package-lock.json b/community/ai-vws-sizing-advisor/frontend/package-lock.json
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/package-lock.json
rename to community/ai-vws-sizing-advisor/frontend/package-lock.json
diff --git a/community/vgpu-sizing-advisor/frontend/package.json b/community/ai-vws-sizing-advisor/frontend/package.json
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/package.json
rename to community/ai-vws-sizing-advisor/frontend/package.json
diff --git a/community/vgpu-sizing-advisor/frontend/postcss.config.mjs b/community/ai-vws-sizing-advisor/frontend/postcss.config.mjs
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/postcss.config.mjs
rename to community/ai-vws-sizing-advisor/frontend/postcss.config.mjs
diff --git a/community/vgpu-sizing-advisor/frontend/public/citations.svg b/community/ai-vws-sizing-advisor/frontend/public/citations.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/citations.svg
rename to community/ai-vws-sizing-advisor/frontend/public/citations.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/collection.svg b/community/ai-vws-sizing-advisor/frontend/public/collection.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/collection.svg
rename to community/ai-vws-sizing-advisor/frontend/public/collection.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/document.svg b/community/ai-vws-sizing-advisor/frontend/public/document.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/document.svg
rename to community/ai-vws-sizing-advisor/frontend/public/document.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/empty-collections.svg b/community/ai-vws-sizing-advisor/frontend/public/empty-collections.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/empty-collections.svg
rename to community/ai-vws-sizing-advisor/frontend/public/empty-collections.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/file.svg b/community/ai-vws-sizing-advisor/frontend/public/file.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/file.svg
rename to community/ai-vws-sizing-advisor/frontend/public/file.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/globe.svg b/community/ai-vws-sizing-advisor/frontend/public/globe.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/globe.svg
rename to community/ai-vws-sizing-advisor/frontend/public/globe.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/next.svg b/community/ai-vws-sizing-advisor/frontend/public/next.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/next.svg
rename to community/ai-vws-sizing-advisor/frontend/public/next.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/nvidia-logo.svg b/community/ai-vws-sizing-advisor/frontend/public/nvidia-logo.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/nvidia-logo.svg
rename to community/ai-vws-sizing-advisor/frontend/public/nvidia-logo.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/settings.svg b/community/ai-vws-sizing-advisor/frontend/public/settings.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/settings.svg
rename to community/ai-vws-sizing-advisor/frontend/public/settings.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/vercel.svg b/community/ai-vws-sizing-advisor/frontend/public/vercel.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/vercel.svg
rename to community/ai-vws-sizing-advisor/frontend/public/vercel.svg
diff --git a/community/vgpu-sizing-advisor/frontend/public/window.svg b/community/ai-vws-sizing-advisor/frontend/public/window.svg
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/public/window.svg
rename to community/ai-vws-sizing-advisor/frontend/public/window.svg
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/api/apply-configuration/route.ts b/community/ai-vws-sizing-advisor/frontend/src/app/api/apply-configuration/route.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/api/apply-configuration/route.ts
rename to community/ai-vws-sizing-advisor/frontend/src/app/api/apply-configuration/route.ts
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/api/available-models/route.ts b/community/ai-vws-sizing-advisor/frontend/src/app/api/available-models/route.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/api/available-models/route.ts
rename to community/ai-vws-sizing-advisor/frontend/src/app/api/available-models/route.ts
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/api/collections/route.ts b/community/ai-vws-sizing-advisor/frontend/src/app/api/collections/route.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/api/collections/route.ts
rename to community/ai-vws-sizing-advisor/frontend/src/app/api/collections/route.ts
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/api/detect-gpu/route.ts b/community/ai-vws-sizing-advisor/frontend/src/app/api/detect-gpu/route.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/api/detect-gpu/route.ts
rename to community/ai-vws-sizing-advisor/frontend/src/app/api/detect-gpu/route.ts
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/api/documents/route.ts b/community/ai-vws-sizing-advisor/frontend/src/app/api/documents/route.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/api/documents/route.ts
rename to community/ai-vws-sizing-advisor/frontend/src/app/api/documents/route.ts
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/api/download-citation/route.ts b/community/ai-vws-sizing-advisor/frontend/src/app/api/download-citation/route.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/api/download-citation/route.ts
rename to community/ai-vws-sizing-advisor/frontend/src/app/api/download-citation/route.ts
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/api/generate/route.ts b/community/ai-vws-sizing-advisor/frontend/src/app/api/generate/route.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/api/generate/route.ts
rename to community/ai-vws-sizing-advisor/frontend/src/app/api/generate/route.ts
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/api/test-configuration/route.ts b/community/ai-vws-sizing-advisor/frontend/src/app/api/test-configuration/route.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/api/test-configuration/route.ts
rename to community/ai-vws-sizing-advisor/frontend/src/app/api/test-configuration/route.ts
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/api/utils/api-utils.ts b/community/ai-vws-sizing-advisor/frontend/src/app/api/utils/api-utils.ts
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/api/utils/api-utils.ts
rename to community/ai-vws-sizing-advisor/frontend/src/app/api/utils/api-utils.ts
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/components/Chat/ApplyConfigurationForm.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/ApplyConfigurationForm.tsx
similarity index 99%
rename from community/vgpu-sizing-advisor/frontend/src/app/components/Chat/ApplyConfigurationForm.tsx
rename to community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/ApplyConfigurationForm.tsx
index d0004a2d2..4f60a9f01 100644
--- a/community/vgpu-sizing-advisor/frontend/src/app/components/Chat/ApplyConfigurationForm.tsx
+++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/ApplyConfigurationForm.tsx
@@ -619,7 +619,7 @@ export default function ApplyConfigurationForm({
-
Apply Configuration
+
Deploy Locally
Deploy vLLM locally using Docker with your recommended configuration
@@ -715,8 +715,8 @@ export default function ApplyConfigurationForm({
: isSubmitting
? "Deploying..."
: isConfigurationComplete
- ? "Apply Configuration Again"
- : "Apply Configuration"}
+ ? "Deploy Locally Again"
+ : "Deploy Locally"}
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/Chat.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/Chat.tsx
new file mode 100644
index 000000000..927e5e40d
--- /dev/null
+++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/Chat.tsx
@@ -0,0 +1,794 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+"use client";
+
+import { useState, useRef, useEffect } from "react";
+import RightSidebar from "../RightSidebar/RightSidebar";
+import VGPUConfigCard from "./VGPUConfigCard";
+import WorkloadConfigWizard from "./WorkloadConfigWizard";
+import ApplyConfigurationForm from "./ApplyConfigurationForm";
+import ChatPanel from "../RightSidebar/ChatPanel";
+import { v4 as uuidv4 } from "uuid";
+import { API_CONFIG } from "@/app/config/api";
+import { marked } from "marked";
+import { useChatStream } from "../../hooks/useChatStream";
+import { ChatMessage, GenerateRequest } from "@/types/chat";
+import { useSettings } from "../../context/SettingsContext";
+import { useSidebar } from "../../context/SidebarContext";
+
+export default function Chat() {
+ const { activePanel, toggleSidebar, setActiveCitations } = useSidebar();
+ const [messages, setMessages] = useState([]);
+ const [isWizardOpen, setIsWizardOpen] = useState(false);
+ const [isApplyFormOpen, setIsApplyFormOpen] = useState(false);
+ const [applyFormConfig, setApplyFormConfig] = useState(null);
+ const [showPassthroughError, setShowPassthroughError] = useState(false);
+ const [lastVGPUConfig, setLastVGPUConfig] = useState(null); // Track last vGPU config for context
+ const [showChatPanel, setShowChatPanel] = useState(false); // Show inline chat panel
+ const [chatPanelHistory, setChatPanelHistory] = useState;
+ }>>([]);
+ const [isChatPanelLoading, setIsChatPanelLoading] = useState(false);
+ const { streamState, processStream, startStream, resetStream, stopStream } =
+ useChatStream();
+
+ const {
+ temperature,
+ topP,
+ vdbTopK,
+ rerankerTopK,
+ confidenceScoreThreshold,
+ useGuardrails,
+ includeCitations,
+ } = useSettings();
+
+ const messagesEndRef = useRef(null);
+
+ const handleToggleSidebar = (
+ panel: "citations",
+ citations?: {
+ text: string;
+ source: string;
+ document_type: "text" | "image" | "table" | "chart";
+ }[]
+ ) => {
+ if (panel === "citations" && citations) {
+ setActiveCitations(citations);
+ if (!activePanel || activePanel !== "citations") {
+ toggleSidebar(panel);
+ }
+ } else {
+ toggleSidebar(panel);
+ }
+ };
+
+ const scrollToBottom = () => {
+ messagesEndRef.current?.scrollIntoView({ behavior: "smooth" });
+ };
+
+ useEffect(() => {
+ scrollToBottom();
+
+ // Update citations in sidebar if panel is already open
+ const lastMessage = messages[messages.length - 1];
+ if (lastMessage && lastMessage.role === "assistant" && lastMessage.citations && lastMessage.citations.length > 0) {
+ // Only update citations if the panel is already open
+ if (activePanel === "citations") {
+ setActiveCitations(lastMessage.citations);
+ }
+ }
+ }, [messages, activePanel, setActiveCitations]);
+
+ // Separate effect to extract vGPU config (only depends on messages, not activePanel)
+ useEffect(() => {
+ const lastMessage = messages[messages.length - 1];
+ if (lastMessage && lastMessage.role === "assistant" && lastMessage.content) {
+ try {
+ const parsed = JSON.parse(lastMessage.content.trim());
+ if (parsed.title === "generate_vgpu_config" && parsed.parameters) {
+ // Only reset chat history if this is a NEW config (different from last one)
+ setLastVGPUConfig((prevConfig: any) => {
+ const prevProfileId = prevConfig?.parameters?.vgpu_profile || prevConfig?.parameters?.vGPU_profile;
+ const newProfileId = parsed.parameters?.vgpu_profile || parsed.parameters?.vGPU_profile;
+
+ // Only reset chat history if this is actually a new config
+ if (prevProfileId !== newProfileId || !prevConfig) {
+ setChatPanelHistory([]);
+ }
+
+ return parsed;
+ });
+ }
+ } catch {
+ // Not a JSON config, ignore
+ }
+ }
+ }, [messages]);
+
+ const handleSubmit = async (message: string) => {
+ if (!message.trim()) return;
+
+ resetStream();
+ const controller = startStream();
+
+ const userMessage = createUserMessage(message);
+ const assistantMessage = createAssistantMessage();
+
+ setMessages((prev) => [...prev, userMessage, assistantMessage]);
+
+ // Debug confidence score threshold being used
+ console.log(`Submitting with confidence threshold: ${confidenceScoreThreshold} (value type: ${typeof confidenceScoreThreshold})`);
+
+ try {
+ const response = await fetch("/api/generate", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify(createRequestBody(userMessage)),
+ signal: controller.signal,
+ });
+
+ if (!response.ok)
+ throw new Error(`HTTP error! status: ${response.status}`);
+
+ await processStream(response, assistantMessage.id, setMessages, confidenceScoreThreshold);
+ } catch (error: unknown) {
+ if (error instanceof Error && error.name === "AbortError") {
+ console.log("Stream aborted");
+ return;
+ }
+ console.error("Error generating response:", error);
+ handleError(assistantMessage.id);
+ }
+ };
+
+ const isVGPUConfig = (content: string): boolean => {
+ try {
+ const parsed = JSON.parse(content.trim());
+ return parsed.title === "generate_vgpu_config" && parsed.parameters;
+ } catch {
+ return false;
+ }
+ };
+
+ const renderMessageContent = (content: string, isTyping: boolean, messageId: string) => {
+ if (isTyping) {
+ return (
+
+
+
+ Generating configuration...
+
+
+ );
+ }
+
+ // Check if content is a vGPU configuration JSON
+ if (isVGPUConfig(content)) {
+ try {
+ const vgpuConfig = JSON.parse(content.trim());
+
+ // Return a preview card with inline details AND chat panel (always expanded)
+ return (
+
+
+
+
+
+
vGPU Configuration Suggestion
+
+
+
+ {(() => {
+ // Only highlight the LAST occurrence of (FP8)/(FP16)/(FP4) - the precision indicator
+ const desc = vgpuConfig.description;
+ const precisionMatch = desc.match(/^(.*)\((FP[4816]+)\)(\s*)$/i);
+ if (precisionMatch) {
+ // Split the non-precision part for Inference/RAG highlighting
+ const mainPart = precisionMatch[1];
+ const precision = precisionMatch[2];
+ return (
+ <>
+ {mainPart.split(/(Inference|RAG)/gi).map((part: string, i: number) =>
+ /^(Inference|RAG)$/i.test(part) ? (
+ {part}
+ ) : part
+ )}
+ ({precision.toUpperCase()})
+ >
+ );
+ }
+ // No precision suffix, just highlight Inference/RAG
+ return desc.split(/(Inference|RAG)/gi).map((part: string, i: number) =>
+ /^(Inference|RAG)$/i.test(part) ? (
+ {part}
+ ) : part
+ );
+ })()}
+
+ {/* Configuration Details - 70% on large screens, 100% on small - NO scrollbar */}
+
+
+
+
+ {/* Chat Panel - 30% on large screens (right side), 100% on small (below)
+ Height adjusts based on workload type: RAG configs need more space */}
+
+
+
+
+
+ {/* Advanced Details - Full width below both panels */}
+
+
+
+
+
+ {/* Divider Line */}
+
+
+ {/* Action Buttons - Side by Side */}
+
+ This workload requires direct GPU access and cannot be tested with the local vLLM deployment feature.
+
+
+
Why is this happening?
+
+ Your workload exceeds the maximum vGPU profile capacity and requires full GPU passthrough mode. This configuration must be deployed directly on hardware with GPU passthrough enabled.
+
+
+
+ Please deploy this configuration on your production environment with the recommended GPU passthrough setup.
+
+
+
+
+ {/* Footer */}
+
+
+
+
+
+ )}
+
+ );
+}
\ No newline at end of file
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/components/Chat/MessageInput.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/MessageInput.tsx
similarity index 100%
rename from community/vgpu-sizing-advisor/frontend/src/app/components/Chat/MessageInput.tsx
rename to community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/MessageInput.tsx
diff --git a/community/vgpu-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx
similarity index 69%
rename from community/vgpu-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx
rename to community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx
index 54889e8b8..940ce5571 100644
--- a/community/vgpu-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx
+++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx
@@ -15,7 +15,32 @@
"use client";
-import { useState, ReactNode, useRef } from "react";
+import { useState, ReactNode } from "react";
+
+// Tooltip trigger component - displays content in card's bottom banner
+const TooltipTrigger = ({
+ content,
+ children,
+ onShow,
+ onHide
+}: {
+ content: string;
+ children: ReactNode;
+ onShow: (content: string) => void;
+ onHide: () => void;
+}) => {
+ return (
+