In [1]:
# Fetch the NeMo Guardrails repository; the example configs used below live in it.
!git clone https://github.com/NVIDIA/NeMo-Guardrails.git

Cloning into 'NeMo-Guardrails'...
remote: Enumerating objects: 6118, done.[K
remote: Counting objects: 100% (1869/1869), done.[K
remote: Compressing objects: 100% (491/491), done.[K
remote: Total 6118 (delta 1483), reused 1618 (delta 1371), pack-reused 4249[K
Receiving objects: 100% (6118/6118), 18.26 MiB | 18.41 MiB/s, done.
Resolving deltas: 100% (3937/3937), done.


In [1]:
cd /content/NeMo-Guardrails/examples/configs/llm

/content/NeMo-Guardrails/examples/configs/llm


In [3]:
%%capture

!pip install transformers==4.33.1 --upgrade
!pip install nemoguardrails --upgrade
!pip install langchain --upgrade
!pip install accelerate --upgrade
!pip install spacy --upgrade #Optional
!pip install datasets bitsandbytes einops  -Uqqq

In [2]:
# Important: keep this in its own cell, separate from the other imports.
# NOTE(review): nest_asyncio.apply() patches asyncio so that event loops can be
# nested; presumably this is needed because the notebook kernel already runs an
# event loop when rails.generate() is called further down — confirm.
import nest_asyncio
nest_asyncio.apply()

In [3]:
# Useful for debugging: configure the root logger at DEBUG level (very verbose).
import logging
logging.basicConfig(level=logging.DEBUG)

# NOTE(review): `accelerate` and `bitsandbytes` are not referenced by name in the
# cells visible here; presumably they are imported so that a missing install
# fails early, before the quantized model is loaded — confirm.
import accelerate
import bitsandbytes
import torch

In [4]:

import os
from getpass import getpass

# Never hard-code the Hugging Face access token in the notebook.
# Keep a token that is already present in the environment (the previous code
# overwrote it with an empty string); otherwise ask for one without echoing it.
# Pressing Enter at the prompt leaves HF_TOKEN empty, as before.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token (HF_TOKEN): ")

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch

# Hugging Face Hub id of the chat model that backs the pipeline.
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# Quantization settings: 4-bit weights in NF4 format with double quantization
# (as described in the QLoRA paper); computation is carried out in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Arguments for loading the quantized model.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally — confirm it is actually required for this checkpoint.
model_load_kwargs = dict(
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=True,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sampling / length settings forwarded to the text-generation pipeline.
generation_kwargs = dict(
    max_length=4096,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    logprobs=None,
    top_k=40,
    repetition_penalty=1.1,
)

# `pipe` is the object later wrapped as the guardrails LLM provider.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_kwargs,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [23]:
from nemoguardrails.llm.providers import (
    HuggingFacePipelineCompatible,
    register_llm_provider,
)



In [22]:
# Import everything this cell uses so it does not depend on another cell
# having been run first.
from nemoguardrails.llm.helpers import get_llm_instance_wrapper
from nemoguardrails.llm.providers import (
    HuggingFacePipelineCompatible,
    register_llm_provider,
)

# Name under which the local pipeline is registered with NeMo Guardrails.
# NOTE(review): this presumably has to equal the `engine:` value in config.yml,
# which is why it is left unchanged here — confirm before renaming.
PROVIDER_NAME = "hf_pipeline_llama2_13b"

# Wrap the transformers pipeline (`pipe`, built in an earlier cell) and
# register the wrapper as an LLM provider under PROVIDER_NAME.
hf_llm = HuggingFacePipelineCompatible(pipeline=pipe)
provider = get_llm_instance_wrapper(llm_instance=hf_llm, llm_type=PROVIDER_NAME)
register_llm_provider(PROVIDER_NAME, provider)

In [24]:
from pathlib import Path
from nemoguardrails import LLMRails, RailsConfig

# Guardrails configuration file of the Llama 2 pipeline example.
CONFIG_DIR = Path("/content/NeMo-Guardrails/examples/configs/llm/hf_pipeline_llama2")
path_to_config = str(CONFIG_DIR / "config.yml")

# Build the rails from that configuration.
config = RailsConfig.from_path(path_to_config)
rails = LLMRails(config)

# Send a single user turn through the rails and print the reply.
user_turn = {"role": "user", "content": "What is the biggest city in the world?"}
completion = rails.generate(messages=[user_turn])

print(completion)

{'role': 'assistant', 'content': 'The largest city in the world by population is Tokyo, Japan with an estimated 38 million people as of 2022.'}


In [None]:
# If the fact-checking rail ("check facts") raises an error, see this thread:
# https://github.com/NVIDIA/NeMo-Guardrails/issues/238

In [14]:
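# config.yml -- edited configuration file. Two changes were made:
#   1. in the output rail flows, `check facts` was changed to `self check facts`;
#   2. in the last prompt, the task `fact_checking` was changed to `self_check_facts`.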
# models:
#   - type: main
#     engine: hf_pipeline_llama2_13b
#     parameters:
#       path: "meta-llama/Llama-2-13b-chat-hf"

#       # Number of GPUs you have (run `nvidia-smi` to check).
#       num_gpus: 2

#       # This can be: "cpu" or "cuda". "mps" is not supported.
#       device: "cuda"

# rails:
#   output:
#     flows:
#       - self check facts

# instructions:
#   - type: general
#     content: |
#       Below is a conversation between a bot and a user about the recent job reports.
#       The bot is factual and concise. If the bot does not know the answer to a
#       question, it truthfully says it does not know.

# sample_conversation: |
#   user "Hello there!"
#     express greeting
#   bot express greeting
#     "Hello! How can I assist you today?"
#   user "What can you do for me?"
#     ask about capabilities
#   bot respond about capabilities
#     "I am an AI assistant which helps answer questions based on a given knowledge base."

# # The prompts below are the same as the ones from `nemoguardrails/llm/prompts/dolly.yml`.
# prompts:
#   - task: general
#     models:
#       - hf_pipeline_llama2_13b
#     content: |-
#       {{ general_instructions }}

#       {{ history | user_assistant_sequence }}
#       Assistant:

#   # Prompt for detecting the user message canonical form.
#   - task: generate_user_intent
#     models:
#       - hf_pipeline_llama2_13b
#     content: |-
#       """
#       {{ general_instructions }}
#       """

#       # This is how a conversation between a user and the bot can go:
#       {{ sample_conversation | verbose_v1 }}

#       # This is how the user talks:
#       {{ examples | verbose_v1 }}

#       # This is the current conversation between the user and the bot:
#       {{ sample_conversation | first_turns(2) | verbose_v1 }}
#       {{ history | colang | verbose_v1 }}

#     output_parser: "verbose_v1"

#   # Prompt for generating the next steps.
#   - task: generate_next_steps
#     models:
#       - hf_pipeline_llama2_13b
#     content: |-
#       """
#       {{ general_instructions }}
#       """

#       # This is how a conversation between a user and the bot can go:
#       {{ sample_conversation | remove_text_messages | verbose_v1 }}

#       # This is how the bot thinks:
#       {{ examples | remove_text_messages | verbose_v1 }}

#       # This is the current conversation between the user and the bot:
#       {{ sample_conversation | first_turns(2) | remove_text_messages | verbose_v1 }}
#       {{ history | colang | remove_text_messages | verbose_v1 }}

#     output_parser: "verbose_v1"

#   # Prompt for generating the bot message from a canonical form.
#   - task: generate_bot_message
#     models:
#       - hf_pipeline_llama2_13b
#     content: |-
#       """
#       {{ general_instructions }}
#       """

#       # This is how a conversation between a user and the bot can go:
#       {{ sample_conversation | verbose_v1 }}

#       {% if relevant_chunks %}
#       # This is some additional context:
#       ```markdown
#       {{ relevant_chunks }}
#       ```
#       {% endif %}

#       # This is how the bot talks:
#       {{ examples | verbose_v1 }}

#       # This is the current conversation between the user and the bot:
#       {{ sample_conversation | first_turns(2) | verbose_v1 }}
#       {{ history | colang | verbose_v1 }}

#     output_parser: "verbose_v1"

#   # Prompt for generating the value of a context variable.
#   - task: generate_value
#     models:
#       - hf_pipeline_llama2_13b
#     content: |-
#       """
#       {{ general_instructions }}
#       """

#       # This is how a conversation between a user and the bot can go:
#       {{ sample_conversation | verbose_v1 }}

#       # This is how the bot thinks:
#       {{ examples | verbose_v1 }}

#       # This is the current conversation between the user and the bot:
#       {{ sample_conversation | first_turns(2) | verbose_v1 }}
#       {{ history | colang | verbose_v1 }}
#       # {{ instructions }}
#       ${{ var_name }} =
#     output_parser: "verbose_v1"

#   - task: self_check_facts
#     models:
#       - hf_pipeline_llama2_13b
#     content: |-
#       <<SYS>>
#       You are given a task to identify if the hypothesis is grounded and entailed to the evidence.
#       You will only use the contents of the evidence and not rely on external knowledge.
#       <</SYS>>

#       [INST]Answer with yes/no. "evidence": {{ evidence }} "hypothesis": {{ response }} "entails":[/INST]
