# DEMO

In [None]:
# Install dependencies
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers accelerate sentencepiece gradio numpy scipy librosa unidecode inflect

In [None]:
# NOTE(review): transformers is already installed by the dependency cell above; this
# line looks redundant — confirm before removing.
!pip install transformers

# Clean up broken/incompatible versions
!pip uninstall torch torchvision numpy -y

# Install PyTorch pinned to exactly 2.1.0 (torchvision 0.16.0), built for CUDA 11.8
!pip install torch==2.1.0+cu118 torchvision==0.16.0+cu118 --index-url https://download.pytorch.org/whl/cu118

# Downgrade NumPy to a compatible version.
# This must run after the torchvision install: the recorded output shows that install
# pulling in a newer NumPy (2.1.2) as a dependency.
!pip install numpy==1.24.4


Found existing installation: torch 2.1.0+cu118
Uninstalling torch-2.1.0+cu118:
  Successfully uninstalled torch-2.1.0+cu118
Found existing installation: torchvision 0.16.0+cu118
Uninstalling torchvision-0.16.0+cu118:
  Successfully uninstalled torchvision-0.16.0+cu118
Found existing installation: numpy 1.24.4
Uninstalling numpy-1.24.4:
  Successfully uninstalled numpy-1.24.4
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.1.0+cu118
  Using cached https://download.pytorch.org/whl/cu118/torch-2.1.0%2Bcu118-cp311-cp311-linux_x86_64.whl (2325.9 MB)
Collecting torchvision==0.16.0+cu118
  Using cached https://download.pytorch.org/whl/cu118/torchvision-0.16.0%2Bcu118-cp311-cp311-linux_x86_64.whl (6.2 MB)
Collecting numpy (from torchvision==0.16.0+cu118)
  Using cached https://download.pytorch.org/whl/numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached https://download.pytorch.org/whl/numpy-2.1.2-cp311-cp311-manyli

In [None]:
# Imports
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import login
from google.colab import userdata
import gradio as gr

In [None]:
# Hugging Face auth
# Read the access token from Colab's Secrets panel instead of hard-coding it in the
# notebook, so the credential is never saved or shared with the .ipynb file.
# Requires a Colab secret named "HF_TOKEN" with notebook access enabled.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token)

In [None]:
# Load LLaMA-3.2-1B-Instruct
model_name = "meta-llama/Llama-3.2-1B-Instruct"
# The model is requested on the CUDA device; the dtype choice is delegated to the
# library via "auto".
# NOTE(review): trust_remote_code=True permits executing code shipped in the model
# repo — presumably not needed for this checkpoint; confirm and drop if so.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Text-generation pipeline handed to the response helpers by the Gradio callback.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
# Keyword arguments forwarded to `pipe` on every call: sampling enabled at
# temperature 0.3, capped at 300 new tokens.
generation_args = {
    "max_new_tokens": 300,
    "return_full_text": False,
    "temperature": 0.3,
    "do_sample": True,
}

Device set to use cuda


In [None]:
# Upgrade the `datasets` library, which the next cell imports to download the RAG corpus.
!pip install -U datasets



In [None]:
from datasets import load_dataset, concatenate_datasets

# Fetch the neural-bridge RAG dataset from the Hugging Face Hub
dataset = load_dataset("neural-bridge/rag-dataset-12000")

# Keep a handle on each of the two splits (train, then test)
train_set, test_set = dataset["train"], dataset["test"]

In [None]:
# Remove examples where any field is empty or just whitespace from both train set and test set
def _has_text_in_all_fields(example):
    """Return True when `context`, `question` and `answer` are all non-blank strings.

    A field counts as blank when it is None or contains only whitespace.
    """
    return all(
        example[field] is not None and bool(example[field].strip())
        for field in ("context", "question", "answer")
    )


print(f"Original size train: {len(train_set)}")
train_set = train_set.filter(_has_text_in_all_fields)
print(f"Size train after removing NaNs: {len(train_set)}\n")

print(f"Original size test: {len(test_set)}")
test_set = test_set.filter(_has_text_in_all_fields)
# Fixed label: this line reports the test split, not the train split.
print(f"Size test after removing NaNs: {len(test_set)}")

Original size train: 9600
Size train after removing NaNs: 9598

Original size test: 2400
Size train after removing NaNs: 2399


In [None]:
# Merge the filtered train and test splits into a single dataset; its `context`
# column is used as the retrieval corpus in the next cell.
full_set = concatenate_datasets([train_set, test_set])

In [None]:
# Document corpus: the `context` column of the combined (train + test) dataset.
doc_corpus = full_set["context"]

# Simple keyword-overlap retriever
class DocRetrieveRerank:
    """Rank documents by how many distinct lower-cased words they share with a query.

    Documents are tokenised once, at construction time, so each query only pays for
    the set intersections instead of re-splitting the whole corpus.
    """

    def __init__(self, docs, semb_model=None, xenc_model=None, device='cuda'):
        """Store the corpus and precompute one word set per document.

        Args:
            docs: iterable of document strings.
            semb_model, xenc_model, device: accepted for interface compatibility
                but not used by this implementation.
        """
        self.docs = docs
        # Snapshot of the corpus taken at construction; later changes to `docs`
        # are not reflected in retrieval results.
        self._doc_list = list(docs)
        self._doc_tokens = [set(doc.lower().split()) for doc in self._doc_list]

    def retrieve(self, query, n_docs=2):
        """Return the `n_docs` documents with the largest word overlap with `query`.

        Args:
            query: query string; lower-cased and split on whitespace.
            n_docs: number of documents to return.

        Returns:
            A tuple `(documents, None)`; the second element is always None.
            Documents with equal overlap keep their original corpus order.
        """
        query_words = set(query.lower().split())
        overlaps = [len(query_words & tokens) for tokens in self._doc_tokens]
        # Stable sort over indices: ties stay in corpus order, as with sorting the
        # documents themselves.
        order = sorted(range(len(self._doc_list)), key=overlaps.__getitem__, reverse=True)
        return [self._doc_list[i] for i in order[:n_docs]], None

# Retriever instance over the dataset contexts; passed to get_rag_response by the Gradio callback.
context_retriever = DocRetrieveRerank(docs=doc_corpus)

# Response generators
def generate_response(system_prompt, user_prompt, pipe, generation_args):
    """Send one system + user chat turn through `pipe` and return the generated text.

    Args:
        system_prompt: content of the "system" message.
        user_prompt: content of the "user" message.
        pipe: callable invoked as `pipe(messages, **generation_args)`.
        generation_args: mapping of keyword arguments forwarded to `pipe`.

    Returns:
        The `'generated_text'` entry of the first item returned by `pipe`.
    """
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_prompt),
    ]
    results = pipe(conversation, **generation_args)
    first_result = results[0]
    return first_result['generated_text']

def get_response(query, pipe, generation_args):
    """Answer `query` with the plain chatbot prompt (no retrieved context).

    Builds the fixed system prompt and a user prompt embedding `query`, then
    delegates to `generate_response` and returns its result.
    """
    return generate_response(
        "You are a friendly chatbot. You answer user questions.",
        f"Answer the user question: {query}",
        pipe,
        generation_args,
    )

def get_rag_response(query, context_retriever, pipe, generation_args, n_docs=2):
    """Answer `query` using documents retrieved for it as additional context.

    Args:
        query: the user question.
        context_retriever: object whose `retrieve(query, n_docs)` returns a
            `(documents, _)` pair, where `documents` is a sequence of strings.
        pipe: generation callable forwarded to `generate_response`.
        generation_args: keyword arguments forwarded to `generate_response`.
        n_docs: how many context documents to retrieve and include in the prompt
            (defaults to 2, the previously hard-coded value).

    Returns:
        Whatever `generate_response` returns for the assembled prompts.
    """
    most_rel_docs, _ = context_retriever.retrieve(query, n_docs)
    # Join outside the f-string: one retrieved document per line.
    context_block = "\n".join(most_rel_docs)
    system_prompt = (
        "You are a friendly chatbot. You answer user questions based on the question and the context documents you are provided with."
    )
    user_prompt = (
        f"Answer the user question: {query}\n\n"
        f"Here are some context documents that could be useful to answer:\n```\n{context_block}\n```"
    )
    return generate_response(system_prompt, user_prompt, pipe, generation_args)

# Load Tacotron2 + WaveGlow from TorchHub
tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16', trust_repo=True).to('cuda').eval()
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16', trust_repo=True).to('cuda').eval()
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils', trust_repo=True)

# Remove weight norm (only once at startup)
from torch.nn.utils import remove_weight_norm
for layer in waveglow.modules():
    try:
        remove_weight_norm(layer)
    except ValueError:
        # remove_weight_norm raises ValueError for modules that carry no weight-norm
        # hook; those are skipped. Any other error is unexpected and now surfaces
        # instead of being silently swallowed.
        continue
waveglow = waveglow.half()

# Real TTS synthesis
def synthesize_tts(text):
    """Convert `text` to speech with the module-level Tacotron2 and WaveGlow models.

    The text is encoded with `utils.prepare_input_sequence`, turned into a mel
    spectrogram by `tacotron2`, and vocoded by `waveglow` in half precision, all
    on the CUDA device and without gradient tracking.

    Returns:
        A `(22050, samples)` tuple where `samples` is the first generated
        waveform as a NumPy array.
    """
    sample_rate = 22050
    token_ids, token_lengths = utils.prepare_input_sequence([text])
    token_ids, token_lengths = token_ids.to('cuda'), token_lengths.to('cuda')
    with torch.no_grad():
        mel_spec, _unused_a, _unused_b = tacotron2.infer(token_ids, token_lengths)
        waveform = waveglow.infer(mel_spec.half())
    first_clip = waveform[0].data.cpu().numpy()
    return (sample_rate, first_clip)

# Gradio UI
def chat_with_rag(user_input, chat_history):
    """Gradio callback: answer `user_input` with and without RAG, and voice the RAG answer.

    Args:
        user_input: the text the user submitted.
        chat_history: list of `(user_message, bot_message)` pairs shown so far,
            or None. The list is extended in place and also returned.

    Returns:
        `(chat_history, audio)`, where `audio` is the result of `synthesize_tts`
        for the RAG answer, or None when generation or synthesis failed (the
        error is then reported as a bot message in the history).
    """
    # Normalise up front so every path, including error handling, can append.
    # Only None is replaced: an empty list must be kept (not swapped for a fresh
    # one) so in-place appends reach the caller's history object.
    if chat_history is None:
        chat_history = []

    try:
        standard = get_response(user_input, pipe, generation_args)
        rag = get_rag_response(user_input, context_retriever, pipe, generation_args)
    except Exception as e:
        chat_history.append((user_input, f"⚠️ Error: {e}"))
        return chat_history, None

    # Each history entry is one (user message, bot message) pair.
    chat_history.append((user_input, f" Standard:\n{standard}\n\n RAG:\n{rag}"))

    try:
        audio = synthesize_tts(rag)
    except Exception as e:
        # Keep the text answer; report the TTS failure as a bot-only message.
        chat_history.append((None, f"⚠️ Error: {e}"))
        return chat_history, None
    return chat_history, audio

# Launch UI
with gr.Blocks() as demo:
    gr.Markdown("# Voice Bot")
    chatbot = gr.Chatbot(label="Chat History")
    user_input = gr.Textbox(label="Ask me something:")
    submit_btn = gr.Button("Submit")
    audio_output = gr.Audio(label="RAG Answer")

    # The chatbot is both an input and an output: its current contents are handed
    # to the callback as the history and replaced by the returned, extended list.
    # (A separate gr.State that is never listed in `outputs` is not updated, so the
    # conversation would not accumulate.)
    submit_btn.click(fn=chat_with_rag,
                     inputs=[user_input, chatbot],
                     outputs=[chatbot, audio_output])

demo.launch()

Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
  WeightNorm.apply(module, name, dim)
Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
  chatbot = gr.Chatbot(label="Chat History")


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ac4b5ae79c441efb8f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


