diff --git a/.gitignore b/.gitignore index a707cb164..560b6f5d4 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,5 @@ docs/**/config # Ignoring log files generated by tests firebase.json scratch.py + +.env diff --git a/docs/user-guides/community/active-fence.md b/docs/user-guides/community/active-fence.md index 688216f29..0c02b0d88 100644 --- a/docs/user-guides/community/active-fence.md +++ b/docs/user-guides/community/active-fence.md @@ -31,7 +31,7 @@ ActiveFence’s ActiveScore API gives flexibility in controlling the behavior of ```colang define flow activefence input moderation detailed - $result = execute call activefence api(text=$user_message) + $result = execute call activefence api if $result.violations.get("abusive_or_harmful.hate_speech", 0) > 0.8 bot inform cannot engage in abusive or harmful behavior diff --git a/docs/user-guides/community/prompt-security.md b/docs/user-guides/community/prompt-security.md new file mode 100644 index 000000000..53ee3b631 --- /dev/null +++ b/docs/user-guides/community/prompt-security.md @@ -0,0 +1,42 @@ +# Prompt Security Integration + +[Prompt Security AI](https://prompt.security/?utm_medium=github&utm_campaign=nemo-guardrails) allows you to protect LLM interaction. This integration enables NeMo Guardrails to use Prompt Security to protect input and output flows. + +You'll need to set the following env variables to work with Prompt Security: + +1. PS_PROTECT_URL - This is the URL of the protect endpoint given by Prompt Security. This will look like https://[REGION].prompt.security/api/protect where REGION is eu, useast or apac +2. PS_APP_ID - This is the application ID given by Prompt Security (similar to an API key). You can get it from admin portal at https://[REGION].prompt.security/ where REGION is eu, useast or apac + +## Setup + +1. Ensure that you have access to Prompt Security API server (SaaS or on-prem). +2. 
Update your `config.yml` file to include the Prompt Security settings:
+
+```yaml
+rails:
+  input:
+    flows:
+      - protect prompt
+  output:
+    flows:
+      - protect response
+```
+
+Don't forget to set the `PS_PROTECT_URL` and `PS_APP_ID` environment variables.
+
+## Usage
+
+Once configured, the Prompt Security integration will automatically:
+
+1. Protect prompts before they are processed by the LLM.
+2. Protect LLM outputs before they are sent back to the user.
+
+The `protect_text` action in `nemoguardrails/library/prompt_security/actions.py` handles the protection process.
+
+## Error Handling
+
+If the Prompt Security API request fails, it's operating in a fail-open mode (not blocking the prompt/response).
+
+## Notes
+
+For more information on Prompt Security and capabilities, please refer to the [Prompt Security documentation](https://prompt.security/?utm_medium=github&utm_campaign=nemo-guardrails).
diff --git a/docs/user-guides/guardrails-library.md b/docs/user-guides/guardrails-library.md
index 14b84fe44..a2a2d9c34 100644
--- a/docs/user-guides/guardrails-library.md
+++ b/docs/user-guides/guardrails-library.md
@@ -23,6 +23,7 @@ NeMo Guardrails comes with a library of built-in guardrails that you can easily
   - [Cleanlab Trustworthiness Score](#cleanlab)
   - [GCP Text Moderation](#gcp-text-moderation)
   - [Private AI PII detection](#private-ai-pii-detection)
+  - [Prompt Security Protection](#prompt-security-protection)

- OpenAI Moderation API - *[COMING SOON]*

4. Other
@@ -772,6 +773,27 @@ rails:

For more details, check out the [Private AI Integration](./community/privateai.md) page.

+
+### Prompt Security Protection
+
+NeMo Guardrails supports using [Prompt Security API](https://prompt.security/?utm_medium=github&utm_campaign=nemo-guardrails) for protecting input and output retrieval flows.
+
+To activate the protection, you need to set the `PS_PROTECT_URL` and `PS_APP_ID` environment variables.
+
+#### Example usage
+
+```yaml
+rails:
+  input:
+    flows:
+      - protect prompt
+  output:
+    flows:
+      - protect response
+```
+
+For more details, check out the [Prompt Security Integration](./community/prompt-security.md) page.
+
 ## Other

 ### Jailbreak Detection Heuristics
diff --git a/examples/configs/prompt_security/README.md b/examples/configs/prompt_security/README.md
new file mode 100644
index 000000000..d1e8ba0fc
--- /dev/null
+++ b/examples/configs/prompt_security/README.md
@@ -0,0 +1,5 @@
+# Prompt Security Configuration Example
+
+This example contains configuration files for using Prompt Security in your NeMo Guardrails project.
+
+For more details on the Prompt Security integration, see [Prompt Security Integration User Guide](../../../docs/user-guides/community/prompt-security.md).
diff --git a/examples/configs/prompt_security/config.yml b/examples/configs/prompt_security/config.yml
new file mode 100644
index 000000000..b33707008
--- /dev/null
+++ b/examples/configs/prompt_security/config.yml
@@ -0,0 +1,13 @@
+models:
+  - type: main
+    engine: openai
+    model: gpt-4o
+
+rails:
+  input:
+    flows:
+      - protect prompt
+
+  output:
+    flows:
+      - protect response
diff --git a/nemoguardrails/library/prompt_security/__init__.py b/nemoguardrails/library/prompt_security/__init__.py
new file mode 100644
index 000000000..9ba9d4310
--- /dev/null
+++ b/nemoguardrails/library/prompt_security/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemoguardrails/library/prompt_security/actions.py b/nemoguardrails/library/prompt_security/actions.py new file mode 100644 index 000000000..b37135eb3 --- /dev/null +++ b/nemoguardrails/library/prompt_security/actions.py @@ -0,0 +1,126 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prompt/Response protection using Prompt Security.""" + +import logging +import os +from typing import Optional + +import httpx + +from nemoguardrails.actions import action + +log = logging.getLogger(__name__) + + +async def ps_protect_api_async( + ps_protect_url: str, + ps_app_id: str, + prompt: Optional[str] = None, + system_prompt: Optional[str] = None, + response: Optional[str] = None, + user: Optional[str] = None, +): + """Calls Prompt Security Protect API asynchronously. + + Args: + ps_protect_url: the URL of the protect endpoint given by Prompt Security. 
+ URL is https://[REGION].prompt.security/api/protect where REGION is eu, useast or apac + + ps_app_id: the application ID given by Prompt Security (similar to an API key). + Get it from the admin portal at https://[REGION].prompt.security/ where REGION is eu, useast or apac + + prompt: the user message to protect. + + system_prompt: the system message for context. + + response: the bot message to protect. + + user: the user ID or username for context. + + Returns: + A dictionary with the following items: + - is_blocked: True if the text should be blocked, False otherwise. + - is_modified: True if the text should be modified, False otherwise. + - modified_text: The modified text if is_modified is True, None otherwise. + """ + + headers = { + "APP-ID": ps_app_id, + "Content-Type": "application/json", + } + payload = { + "prompt": prompt, + "system_prompt": system_prompt, + "response": response, + "user": user, + } + async with httpx.AsyncClient() as client: + modified_text = None + ps_action = "log" + try: + ret = await client.post(ps_protect_url, headers=headers, json=payload) + res = ret.json() + ps_action = res.get("result", {}).get("action", "log") + if ps_action == "modify": + key = "response" if response else "prompt" + modified_text = res.get("result", {}).get(key, {}).get("modified_text") + except Exception as e: + log.error("Error calling Prompt Security Protect API: %s", e) + return { + "is_blocked": ps_action == "block", + "is_modified": ps_action == "modify", + "modified_text": modified_text, + } + + +@action(is_system_action=True) +async def protect_text( + user_prompt: Optional[str] = None, bot_response: Optional[str] = None +): + """Protects the given user_prompt or bot_response. + Args: + user_prompt: The user message to protect. + bot_response: The bot message to protect. + Returns: + A dictionary with the following items: + - is_blocked: True if the text should be blocked, False otherwise. 
+ - is_modified: True if the text should be modified, False otherwise. + - modified_text: The modified text if is_modified is True, None otherwise. + Raises: + ValueError is returned in one of the following cases: + 1. If PS_PROTECT_URL env variable is not set. + 2. If PS_APP_ID env variable is not set. + 3. If no user_prompt and no bot_response is provided. + """ + + ps_protect_url = os.getenv("PS_PROTECT_URL") + if not ps_protect_url: + raise ValueError("PS_PROTECT_URL env variable is required for Prompt Security.") + + ps_app_id = os.getenv("PS_APP_ID") + if not ps_app_id: + raise ValueError("PS_APP_ID env variable is required for Prompt Security.") + + if bot_response: + return await ps_protect_api_async( + ps_protect_url, ps_app_id, None, None, bot_response + ) + + if user_prompt: + return await ps_protect_api_async(ps_protect_url, ps_app_id, user_prompt) + + raise ValueError("Neither user_message nor bot_message was provided") diff --git a/nemoguardrails/library/prompt_security/flows.co b/nemoguardrails/library/prompt_security/flows.co new file mode 100644 index 000000000..6d5d691dc --- /dev/null +++ b/nemoguardrails/library/prompt_security/flows.co @@ -0,0 +1,24 @@ +# INPUT RAILS + +@active +flow protect prompt + """Check if the prompt is valid according to Prompt Security.""" + $result = await protect_text(user_prompt=$user_message) + if $result["is_blocked"] + bot inform answer unknown + stop + else if $result["is_modified"] + $user_message = $result["modified_text"] + + +# OUTPUT RAILS + +@active +flow protect response + """Check if the response is valid according to Prompt Security.""" + $result = await protect_text(bot_response=$bot_message) + if $result["is_blocked"] + bot inform answer unknown + stop + else if $result["is_modified"] + $bot_message = $result["modified_text"] diff --git a/nemoguardrails/library/prompt_security/flows.v1.co b/nemoguardrails/library/prompt_security/flows.v1.co new file mode 100644 index 000000000..04b747d16 --- /dev/null 
+++ b/nemoguardrails/library/prompt_security/flows.v1.co @@ -0,0 +1,22 @@ +# INPUT RAILS + +define subflow protect prompt + """Check if the prompt is valid according to Prompt Security.""" + $result = execute protect_text(user_prompt=$user_message) + if $result["is_blocked"] + bot inform answer unknown + stop + else if $result["is_modified"] + $user_message = $result["modified_text"] + + +# OUTPUT RAILS + +define subflow protect response + """Check if the response is valid according to Prompt Security.""" + $result = execute protect_text(bot_response=$bot_message) + if $result["is_blocked"] + bot inform answer unknown + stop + else if $result["is_modified"] + $bot_message = $result["modified_text"] diff --git a/tests/test_prompt_security.py b/tests/test_prompt_security.py new file mode 100644 index 000000000..3676e4165 --- /dev/null +++ b/tests/test_prompt_security.py @@ -0,0 +1,133 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from nemoguardrails import RailsConfig +from tests.utils import TestChat + + +def mock_protect_text(return_value): + def mock_request(*args, **kwargs): + return return_value + + return mock_request + + +@pytest.mark.unit +def test_prompt_security_protection_disabled(): + config = RailsConfig.from_content( + colang_content=""" + define user express greeting + "hi" + + define flow + user express greeting + bot express greeting + + define bot inform answer unknown + "I can't answer that." + """, + ) + + chat = TestChat( + config, + llm_completions=[ + " express greeting", + ' "Hi! My name is John as well."', + ], + ) + + chat.app.register_action( + mock_protect_text({"is_blocked": True, "is_modified": False}), "protect_text" + ) + chat >> "Hi! I am Mr. John! And my email is test@gmail.com" + chat << "Hi! My name is John as well." + + +@pytest.mark.unit +def test_prompt_security_protection_input(): + config = RailsConfig.from_content( + yaml_content=""" + models: [] + rails: + input: + flows: + - protect prompt + """, + colang_content=""" + define user express greeting + "hi" + + define flow + user express greeting + bot express greeting + + define bot inform answer unknown + "I can't answer that." + """, + ) + + chat = TestChat( + config, + llm_completions=[ + " express greeting", + ' "Hi! My name is John as well."', + ], + ) + + chat.app.register_action( + mock_protect_text({"is_blocked": True, "is_modified": False}), "protect_text" + ) + chat >> "Hi! I am Mr. John! And my email is test@gmail.com" + chat << "I can't answer that." + + +@pytest.mark.unit +def test_prompt_security_protection_output(): + config = RailsConfig.from_content( + yaml_content=""" + models: [] + rails: + output: + flows: + - protect response + """, + colang_content=""" + define user express greeting + "hi" + + define flow + user express greeting + bot express greeting + + define bot inform answer unknown + "I can't answer that." 
+ """, + ) + + chat = TestChat( + config, + llm_completions=[ + " express greeting", + ' "Hi! My name is John as well."', + ], + ) + + chat.app.register_action( + mock_protect_text({"is_blocked": True, "is_modified": False}), "protect_text" + ) + chat >> "Hi!" + chat << "I can't answer that."