In [None]:
# !pip

In [47]:
import os
import getpass

if not os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
    nvapi_key = getpass.getpass("Enter your NVIDIA API key: ")
    assert nvapi_key.startswith("nvapi-"), f"{nvapi_key[:5]}... is not a valid key"
    os.environ["NVIDIA_API_KEY"] = nvapi_key
    os.environ["NGC_API_KEY"] = nvapi_key


In [8]:
import os
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from pydantic.v1 import BaseModel
from langchain_nvidia_ai_endpoints._statics import Model, register_model

In [9]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA

In [10]:
ChatNVIDIA.get_available_models()



[Model(id='deepseek-ai/deepseek-coder-6.7b-instruct', model_type='chat', client='ChatNVIDIA', endpoint=None, aliases=['ai-deepseek-coder-6_7b-instruct'], supports_tools=False, base_model=None),
 Model(id='nvidia/neva-22b', model_type='vlm', client='ChatNVIDIA', endpoint='https://ai.api.nvidia.com/v1/vlm/nvidia/neva-22b', aliases=['ai-neva-22b', 'playground_neva_22b', 'neva_22b'], supports_tools=False, base_model=None),
 Model(id='google/gemma-2-9b-it', model_type='chat', client='ChatNVIDIA', endpoint=None, aliases=['ai-gemma-2-9b-it'], supports_tools=False, base_model=None),
 Model(id='snowflake/arctic', model_type='chat', client='ChatNVIDIA', endpoint=None, aliases=['ai-arctic'], supports_tools=False, base_model=None),
 Model(id='nvidia/llama3-chatqa-1.5-70b', model_type='qa', client='ChatNVIDIA', endpoint=None, aliases=['ai-chatqa-1.5-70b'], supports_tools=False, base_model=None),
 Model(id='microsoft/phi-3-small-128k-instruct', model_type='chat', client='ChatNVIDIA', endpoint=None, 

<h3>
 Hands-on section    
</h3>

In [49]:
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings

embeddingmodels = NVIDIAEmbeddings(model="NV-Embed-QA", truncate="END")

In [50]:
import requests
import json

url = "https://integrate.api.nvidia.com/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer {}".format(os.environ["NVIDIA_API_KEY"])
}
data = {
    "model": "mistralai/mistral-7b-instruct-v0.3",
    "messages": [{"role": "user", "content": "When was Nvidia founded and who is the CEO?"}],
    "temperature": 0.5,
    "top_p": 0.7,
    "max_tokens": 1024,
    "stream": False
}

response = requests.post(url, headers=headers, data=json.dumps(data))

print(response.json()['choices'][0]['message']['content'])


Nvidia was founded on April 5, 1993, by Jensen Huang, Chris Malachowsky, Curtis Priem, and Ray Bingham. As of 2022, the CEO is Jensen Huang, who has been leading the company since co-founding it.


In [51]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA

model = "mistralai/mistral-7b-instruct-v0.3"
llm = ChatNVIDIA(model=model, max_tokens=100)

In [52]:
result = llm.invoke("What is a RAG?")
result

AIMessage(content="In the context of project management, a RAG rating is a quick and simple way to communicate a project's current status using three categories: Red, Amber, Green.\n\n1. Red: This indicates a problem or risk that needs immediate attention. The project is not on target, and corrective action is needed.\n\n2. Amber: This signifies that there are issues, but they are not yet critical. The project is facing challenges, but there are opportunities to", response_metadata={'role': 'assistant', 'content': "In the context of project management, a RAG rating is a quick and simple way to communicate a project's current status using three categories: Red, Amber, Green.\n\n1. Red: This indicates a problem or risk that needs immediate attention. The project is not on target, and corrective action is needed.\n\n2. Amber: This signifies that there are issues, but they are not yet critical. The project is facing challenges, but there are opportunities to", 'token_usage': {'prompt_token

In [53]:
urls = ["https://www.nvidia.com/en-in/glossary/retrieval-augmented-generation/",
       "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
        "https://docs.nvidia.com/cuda/",
        "https://github.com/NVIDIA/cuda-samples"
        "https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html"
       ]

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from typing import List, Union

def html_document_loader(url: Union[str, bytes]) -> str:
    """
    Loads the HTML content of a document from a given URL and return it's content.

    Args:
        url: The URL of the document.

    Returns:
        The content of the document.

    Raises:
        Exception: If there is an error while making the HTTP request.

    """
    try:
        response = requests.get(url)
        html_content = response.text
    except Exception as e:
        print(f"Failed to load {url} due to exception {e}")
        return ""

    try:
        # Create a Beautiful Soup object to parse html
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style tags
        for script in soup(["script", "style"]):
            script.extract()

        # Get the plain text from the HTML document
        text = soup.get_text()

        # Remove excess whitespace and newlines
        text = re.sub("\s+", " ", text).strip()

        return text
    except Exception as e:
        print(f"Exception {e} while loading document")
        return ""

In [None]:
def create_embeddings(embeddings_model,embedding_path: str = "./embed"):

    embedding_path = "./embed"
    print(f"Storing embeddings to {embedding_path}")

    documents = []
    for url in urls:
        document = html_document_loader(url)
        documents.append(document)


    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=0,
        length_function=len,
    )
    print("Total documents:",len(documents))
    texts = text_splitter.create_documents(documents)
    print("Total texts:",len(texts))
    index_docs(embeddings_model,url, text_splitter, texts, embedding_path,)
    print("Generated embedding successfully")

In [None]:
from typing import List, Union
import os
from langchain.vectorstores import FAISS

def index_docs(embeddings_model, url: Union[str, bytes], splitter, documents: List[str], dest_embed_dir: str) -> None:
    """
    Split the documents into chunks and create embeddings for them.
    
    Args:
        embeddings_model: Model used for creating embeddings.
        url: Source url for the documents.
        splitter: Splitter used to split the documents.
        documents: List of documents whose embeddings need to be created.
        dest_embed_dir: Destination directory for embeddings.
    """
    texts = []
    metadatas = []

    for document in documents:
        chunk_texts = splitter.split_text(document.page_content)
        texts.extend(chunk_texts)
        metadatas.extend([document.metadata] * len(chunk_texts))

    if os.path.exists(dest_embed_dir):
        docsearch = FAISS.load_local(
            folder_path=dest_embed_dir, 
            embeddings=embeddings_model, 
            allow_dangerous_deserialization=True
        )
        docsearch.add_texts(texts, metadatas=metadatas)
    else:
        docsearch = FAISS.from_texts(texts, embedding=embeddings_model, metadatas=metadatas)

    docsearch.save_local(folder_path=dest_embed_dir)

In [None]:
%%time
create_embeddings(embeddings_model=embeddings_model)

In [None]:
embedding_path = "./embed/"
docsearch = FAISS.load_local(folder_path=embedding_path, embeddings=embeddings_model, allow_dangerous_deserialization=True)


In [None]:
summary_llm = ChatNVIDIA(model="meta/llama-3.1-70b-instruct")
chat_llm = ChatNVIDIA(model="meta/llama3-8b-instruct",
                      temperature=0.1,
                      max_tokens=1000,
                      top_p=1.0)

In [None]:
from langchain.chains import (
create_history_aware_retriever,
create_retrieval_chain)

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory


In [None]:
#Setup Message History format and retriever
contextualize_q_system_prompt = """Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

#Setup Agent Behaviour
qa_system_prompt = """You are an assistant for question-answering tasks. \
Try to use the following pieces of retrieved context to answer the question. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


In [None]:
history_aware_retriever = create_history_aware_retriever(
    summary_llm, 
    docsearch.as_retriever(), # the vectorstore serves as the junction to retrieve documents with highest similarity to the query.
    contextualize_q_prompt
)
question_answer_chain = create_stuff_documents_chain(chat_llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [None]:
# sample session creation
store = {}
def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [None]:
conversational_rag_chain.invoke(
    {"input": "What is RAG?"},
     config={
        "configurable": {"session_id": "abc123"}
    }
)["answer"]

In [None]:
out = conversational_rag_chain.invoke(
    {"input": "How is it helpful?"},
     config={
        "configurable": {"session_id": "abc123"}
    }
)["answer"]
print(out)

In [None]:
out = conversational_rag_chain.invoke(
    {"input": "What is the meaning of retrieval?"},
     config={
        "configurable": {"session_id": "abc123"}
    }
)["answer"]
print(out)

In [None]:
out = conversational_rag_chain.invoke(
    {"input": "Can you write a kernel to add the elements of two arrays and store the output in a third array?"},
     config={
        "configurable": {"session_id": "xyz456"}
    }
)["answer"]

print(out)

<h3>Initialising "meta / llama-3.1-8b-instruct" LLM model to the cluster</h3>

In [5]:
! docker ps -a --no-trunc

CONTAINER ID                                                       IMAGE                                                COMMAND                                                       CREATED        STATUS                  PORTS                                         NAMES
4ad7b8ee2bb5f14cde2e3e60e4eafad6e0b4b61a78670deb441a0755c90392df   nvcr.io/nim/nvidia/nv-embedqa-e5-v5:1.0.1            "/opt/nvidia/nvidia_entrypoint.sh /opt/nim/start-server.sh"   46 hours ago   Up 46 hours             0.0.0.0:11022->8000/tcp, :::11022->8000/tcp   embed_nim
6ef6ad73da5540b0dd28c15da4bf2ab25fc02cc7d2432b495bfdb0295bcfb7f8   nvcr.io/nim/nvidia/nv-rerankqa-mistral-4b-v3:1.0.2   "/opt/nvidia/nvidia_entrypoint.sh /opt/nim/start-server.sh"   46 hours ago   Up 46 hours             0.0.0.0:11737->8000/tcp, :::11737->8000/tcp   rerank_nim
a5732a8ccde9ca1fbf867980c23f1dd95424b2072b1f6f77c6f408aa39f48ad5   nvcr.io/nim/meta/llama-3.1-8b-instruct:1.1.2         "/opt/nvidia/nvidia_entrypoint.sh /opt/nim/start-ser

In [4]:
 !docker exec -it a5732a8ccde9 bash

[?2004hI have no name!@a5732a8ccde9:/$ ^C[?2004l
[?2004l
[?2004hI have no name!@a5732a8ccde9:/$ 

In [66]:
! echo -e "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [None]:
! docker pull nvcr.io/nim/meta/llama-3.1-8b-instruct:1.1.2

In [6]:
! docker image ls -a --no-trunc


REPOSITORY                                     TAG       IMAGE ID                                                                  CREATED        SIZE
nvcr.io/nim/meta/llama-3.1-8b-instruct         1.1.2     sha256:c9102c3e95f3acae5b41fdafd7aefd3389bdaf33cf0b3458041e45046bf6bb1d   7 weeks ago    12.9GB
nvcr.io/nim/nvidia/nv-rerankqa-mistral-4b-v3   1.0.2     sha256:47df3d3998eb2f8fcf3da6c8aa55297b34c4789234ea0f2cbf54b4bcd411b37d   2 months ago   15.5GB
nvcr.io/nim/nvidia/nv-embedqa-e5-v5            1.0.1     sha256:fa5c1fc5ccb39cd6d9910ac04fb8bda7ea46e6f82de8d05c2972d1071d5a62c7   2 months ago   15.7GB


In [None]:
!export NGC_API_KEY="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"

In [11]:
from os.path import expanduser
home = expanduser("~")
os.environ['LOCAL_NIM_CACHE']=f"{home}/.cache/nim"
!echo $LOCAL_NIM_CACHE

/home/gsh-3atzc7/.cache/nim


In [None]:
!mkdir -p "$LOCAL_NIM_CACHE"
!chmod 777 "$LOCAL_NIM_CACHE"

In [None]:
import random
import socket

def find_available_port(start=11000, end=11999):
    while True:
        # Randomly select a port between start and end range
        port = random.randint(start, end)
        
        # Try to create a socket and bind to the port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind(("localhost", port))
                # If binding is successful, the port is free
                return port
            except OSError:
                # If binding fails, the port is in use, continue to the next iteration
                continue

# Find and print an available port
os.environ['CONTAINER_PORT'] = str(find_available_port())
print(f"Your have been alloted the available port: {os.environ['CONTAINER_PORT']}")

In [None]:
! docker run -it -d --rm \
--gpus 1 \
--name=LLM_nim \
--shm-size=16GB \
-e NGC_API_KEY \
-v $LOCAL_NIM_CACHE:/opt/nim/.cache \
-u $(id -u) \
-p 8000:8000 \
nvcr.io/nim/meta/llama-3.1-8b-instruct:1.1.2
# In order to ensure, the local NIM container is completely loaded and doesn't remain in pending stage, we instantiate a wait interval
! sleep 60

In [None]:
! docker logs --details LLM_nim

In [55]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA

llm = ChatNVIDIA(base_url= "http://0.0.0.0:8000/v1", model="meta/llama-3.1-8b-instruct", temperature=0.1, max_tokens=1000, top_p=1.0)

result = llm.invoke("What is the capital of France?")
print(result.content)

The capital of France is Paris.


In [56]:
!curl -X 'POST' \
    "http://0.0.0.0:8000/v1/completions" \
    -H "accept: application/json" \
    -H "Content-Type: application/json" \
    -d '{"model": "meta/llama-3.1-8b-instruct", "prompt": "What is the capital of France?", "max_tokens": 64}'

{"id":"cmpl-841c3b6fea2d4ec8a3db6ae681e1c24a","object":"text_completion","created":1728051640,"model":"meta/llama-3.1-8b-instruct","choices":[{"index":0,"text":" [Answer]\nThe capital of France is Paris.\nLooking for somewhere to eat in Paris?\nWhy the Eiffel Tower is a must visit.\nPractical information about visiting France. [Answer]\nLanguage : French\nCurrency: Euro\nVisa requirements: Check with your home country's government on the current visa requirements\n","logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":8,"total_tokens":72,"completion_tokens":64}}

In [None]:
! docker ps


In [None]:
!docker container stop rerank_nim

In [None]:
!docker container stop embed_nim

In [None]:
!docker run nvcr.io/nim/meta/llama-3.1-8b-instruct:1.1.2 list-model-profiles

In [None]:
! docker ps


In [None]:
! docker ls

In [None]:
from os.path import expanduser
home = expanduser("~")
os.environ['LOCAL_NIM_CACHE']=f"{home}/.cache/nim"
!echo $LOCAL_NIM_CACHE

In [None]:
!mkdir -p "$LOCAL_NIM_CACHE"
# !chmod 777 "$LOCAL_NIM_CACHE"

In [None]:
import random
import socket

def find_available_port(start=11000, end=11999):
    while True:
        # Randomly select a port between start and end range
        port = random.randint(start, end)
        
        # Try to create a socket and bind to the port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind(("localhost", port))
                # If binding is successful, the port is free
                return port
            except OSError:
                # If binding fails, the port is in use, continue to the next iteration
                continue

# Find and print an available port
os.environ['CONTAINER_PORT'] = str(find_available_port())
print(f"Your have been alloted the available port: {os.environ['CONTAINER_PORT']}")

In [None]:
! docker run -it -d --rm \
--gpus device=1 \
--name=embed_nim \
--shm-size=16GB  \
-e NGC_API_KEY \
-v $LOCAL_NIM_CACHE:/opt/nim/.cache \
-u $(id -u) \
-p $CONTAINER_PORT:8000 \
nvcr.io/nim/nvidia/nv-embedqa-e5-v5:1.0.1

# In order to ensure, the local NIM container is completely loaded and doesn't remain in pending stage, we instantiate a wait interval
! sleep 60

In [None]:
!docker ps


In [None]:
!docker container stop embed_nim

<h3>Deploying embedding and retriever models locally</h3>

In [None]:
! docker pull nvcr.io/nim/nvidia/nv-rerankqa-mistral-4b-v3:1.0.2

In [None]:
from os.path import expanduser
home = expanduser("~")
os.environ['LOCAL_NIM_CACHE']=f"{home}/.cache/nim"
!echo $LOCAL_NIM_CACHE

In [None]:
!mkdir -p "$LOCAL_NIM_CACHE"
# !chmod 777 "$LOCAL_NIM_CACHE"

In [None]:
import random
import socket

def find_available_port(start=11000, end=11999):
    while True:
        # Randomly select a port between start and end range
        port = random.randint(start, end)
        
        # Try to create a socket and bind to the port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind(("localhost", port))
                # If binding is successful, the port is free
                return port
            except OSError:
                # If binding fails, the port is in use, continue to the next iteration
                continue

# Find and print an available port
os.environ['CONTAINER_PORT'] = str(find_available_port())
print(f"Your have been alloted the available port: {os.environ['CONTAINER_PORT']}")

In [None]:
! docker run -it -d --rm \
--gpus device=2  \
--name=rerank_nim \
--shm-size=16GB  \
-e NGC_API_KEY \
-v $LOCAL_NIM_CACHE:/opt/nim/.cache \
-u $(id -u) \
-p $CONTAINER_PORT:8000 \
nvcr.io/nim/nvidia/nv-rerankqa-mistral-4b-v3:1.0.2

In [13]:
!docker ps

CONTAINER ID   IMAGE                                                COMMAND                  CREATED       STATUS       PORTS                                         NAMES
4ad7b8ee2bb5   nvcr.io/nim/nvidia/nv-embedqa-e5-v5:1.0.1            "/opt/nvidia/nvidia_…"   7 hours ago   Up 7 hours   0.0.0.0:11022->8000/tcp, :::11022->8000/tcp   embed_nim
6ef6ad73da55   nvcr.io/nim/nvidia/nv-rerankqa-mistral-4b-v3:1.0.2   "/opt/nvidia/nvidia_…"   7 hours ago   Up 7 hours   0.0.0.0:11737->8000/tcp, :::11737->8000/tcp   rerank_nim
a5732a8ccde9   nvcr.io/nim/meta/llama-3.1-8b-instruct:1.1.2         "/opt/nvidia/nvidia_…"   8 hours ago   Up 8 hours   0.0.0.0:8000->8000/tcp, :::8000->8000/tcp     LLM_nim


In [57]:
urls = ["https://www.nvidia.com/en-in/glossary/retrieval-augmented-generation/",
       "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
        "https://docs.nvidia.com/cuda/",
        "https://github.com/NVIDIA/cuda-samples"
        "https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html"
       ]

In [58]:
import re
import requests
from bs4 import BeautifulSoup
from typing import List, Union

def html_document_loader(url: Union[str, bytes]) -> str:
    """
    Loads the HTML content of a document from a given URL and return it's content.

    Args:
        url: The URL of the document.

    Returns:
        The content of the document.

    Raises:
        Exception: If there is an error while making the HTTP request.

    """
    try:
        response = requests.get(url)
        html_content = response.text
    except Exception as e:
        print(f"Failed to load {url} due to exception {e}")
        return ""

    try:
        # Create a Beautiful Soup object to parse html
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style tags
        for script in soup(["script", "style"]):
            script.extract()

        # Get the plain text from the HTML document
        text = soup.get_text()

        # Remove excess whitespace and newlines
        text = re.sub("\s+", " ", text).strip()

        return text
    except Exception as e:
        print(f"Exception {e} while loading document")
        return ""

In [59]:
def create_embeddings(embeddings_model,embedding_path: str = "./embed"):

    embedding_path = "./embed"
    print(f"Storing embeddings to {embedding_path}")

    documents = []
    for url in urls:
        document = html_document_loader(url)
        documents.append(document)


    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=0,
        length_function=len,
    )
    print("Total documents:",len(documents))
    texts = text_splitter.create_documents(documents)
    print("Total texts:",len(texts))
    index_docs(embeddings_model,url, text_splitter, texts, embedding_path,)
    print("Generated embedding successfully")

In [14]:
embeddings_model = NVIDIAEmbeddings(base_url="http://0.0.0.0:11022/v1", model='nvidia/nv-embedqa-e5-v5')

<h3>Faiss vector store through langchain</h3>

In [15]:
from typing import List, Union
import os
from langchain.vectorstores import FAISS
# model_kwargs = {"device": "cuda"}

def index_docs(embeddings_model, url: Union[str, bytes], splitter, documents: List[str], dest_embed_dir: str) -> None:
    """
    Split the documents into chunks and create embeddings for them.
    
    Args:
        embeddings_model: Model used for creating embeddings.
        url: Source url for the documents.
        splitter: Splitter used to split the documents.
        documents: List of documents whose embeddings need to be created.
        dest_embed_dir: Destination directory for embeddings.
    """
    texts = []
    metadatas = []

    for document in documents:
        chunk_texts = splitter.split_text(document.page_content)
        texts.extend(chunk_texts)
        metadatas.extend([document.metadata] * len(chunk_texts))

    if os.path.exists(dest_embed_dir):
        docsearch = FAISS.load_local(
            folder_path=dest_embed_dir, 
            embeddings=embeddings_model, 
            allow_dangerous_deserialization=True
        )
        docsearch.add_texts(texts, metadatas=metadatas)
    else:
        docsearch = FAISS.from_texts(texts, embedding=embeddings_model, metadatas=metadatas,)

    docsearch.save_local(folder_path=dest_embed_dir)

In [16]:
%%time
create_embeddings(embeddings_model=embeddings_model)

Storing embeddings to ./embed
Processing PDF: CUDA_C_Programming_Guide.pdf
Total documents: 6
Total texts: 3026
Generated embedding successfully
CPU times: user 2.67 s, sys: 181 ms, total: 2.85 s
Wall time: 10.4 s


In [17]:
# load Embed documents
embedding_path = "./embed/"
docsearch = FAISS.load_local(folder_path=embedding_path, embeddings=embeddings_model, allow_dangerous_deserialization=True)

<h3>HTML and pdf embedding </h3>

In [14]:
import os
import re
import requests
import pymupdf  # Explicitly using pymupdf instead of fitz
from bs4 import BeautifulSoup
from typing import List, Union
from langchain.vectorstores import FAISS
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# List of URLs
urls = [
    "https://www.nvidia.com/en-in/glossary/retrieval-augmented-generation/",
    "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
    "https://docs.nvidia.com/cuda/",
    "https://github.com/NVIDIA/cuda-samples",
    "https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html",
    "https://developer.nvidia.com/blog/efficient-cuda-debugging-using-compute-sanitizer-with-nvtx-and-creating-custom-tools/",
    "https://developer.nvidia.com/blog/debugging-cuda-more-efficiently-with-nvidia-compute-sanitizer/",
    "https://developer.nvidia.com/blog/multi-gpu-programming-with-standard-parallel-c-part-1/",
    "https://developer.nvidia.com/blog/multi-gpu-programming-with-standard-parallel-c-part-2",
    "https://developer.nvidia.com/blog/efficient-cuda-debugging-memory-initialization-and-thread-synchronization-with-nvidia-compute-sanitizer/",
    "https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial02/#introduction",
    "https://developer.nvidia.com/blog/even-easier-introduction-cuda/",
    "https://github.com/jkonvicka/Nvidia-CUDA-course/blob/main/AC_CUDA_C.md",
    "https://github.com/jkonvicka/Nvidia-CUDA-course/blob/main/cuda_cheatsheet.md",
    "https://github.com/jkonvicka/Nvidia-CUDA-course/blob/main/Unified%20Memory.md",
    "https://github.com/jkonvicka/Nvidia-CUDA-course/blob/main/Streaming%20and%20Visual%20Profiling.md",
    "https://github.com/jkonvicka/Nvidia-CUDA-course/blob/main/AC_CUDA_C.md",
    "https://tbetcke.github.io/hpc_lecture_notes/intro.html",
    
]

# Directory containing PDF files
pdf_dir = '/mnt/lustre/hackathons/hack_teams/hack_team_16/workspace/Jishnu/Pdf'

# Directory containing other file types (e.g., .txt, .docx)
other_files_dir = '/mnt/lustre/hackathons/hack_teams/hack_team_16/workspace/Jishnu/text'

# Function to load HTML content from a URL
def html_document_loader(url: Union[str, bytes]) -> str:
    try:
        response = requests.get(url)
        html_content = response.text
    except Exception as e:
        print(f"Failed to load {url} due to exception {e}")
        return ""

    try:
        soup = BeautifulSoup(html_content, "html.parser")
        # Remove script and style tags
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        text = re.sub(r"\s+", " ", text).strip()
        return text if isinstance(text, str) else ""
    except Exception as e:
        print(f"Exception {e} while loading document")
        return ""

# Function to extract text from PDFs using PyMuPDF
def extract_text_from_pdfs(pdf_directory: str) -> List[dict]:
    pdf_texts = []
    
    for pdf_file in os.listdir(pdf_directory):
        if pdf_file.lower().endswith('.pdf'):
            pdf_path = os.path.join(pdf_directory, pdf_file)
            print(f"Processing PDF: {pdf_file}")

            # Open the PDF file
            with pymupdf.open(pdf_path) as doc:
                extracted_text = ""
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    extracted_text += page.get_text()
            
            # Ensure extracted text is a string
            extracted_text = extracted_text if isinstance(extracted_text, str) else ""
            
            # Append extracted text and metadata
            pdf_texts.append({
                'page_content': extracted_text,
                'metadata': {'type': 'pdf', 'source': pdf_file}
            })

    return pdf_texts

# Function to handle other file types using UnstructuredFileLoader
def extract_text_from_other_files(file_directory: str) -> List[dict]:
    other_texts = []
    
    for file_name in os.listdir(file_directory):
        file_path = os.path.join(file_directory, file_name)
        # Use UnstructuredFileLoader for other file types
        loader = UnstructuredFileLoader(file_path)
        try:
            loaded_documents = loader.load()
            for doc in loaded_documents:
                other_texts.append({
                    'page_content': doc.page_content,
                    'metadata': {'type': 'file', 'source': file_name}
                })
        except Exception as e:
            print(f"Failed to load file {file_name} due to exception: {e}")
    
    return other_texts

# Function to index documents using embeddings
def index_docs(embeddings_model, url: Union[str, bytes], splitter, documents: List[object], dest_embed_dir: str) -> None:
    texts = []
    metadatas = []

    for document in documents:
        # Access the content and metadata of each Document object
        page_content = document.page_content
        metadata = document.metadata
        
        # Ensure document content is a string
        if not isinstance(page_content, str):
            print(f"Skipping document due to invalid content: {document}")
            continue
        
        chunk_texts = splitter.split_text(page_content)
        texts.extend(chunk_texts)
        metadatas.extend([metadata] * len(chunk_texts))

    # Load existing FAISS index or create a new one if it doesn't exist
    if os.path.exists(dest_embed_dir):
        docsearch = FAISS.load_local(
            folder_path=dest_embed_dir, 
            embeddings=embeddings_model, 
            allow_dangerous_deserialization=True
        )
        docsearch.add_texts(texts, metadatas=metadatas)
    else:
        docsearch = FAISS.from_texts(texts, embedding=embeddings_model, metadatas=metadatas)

    docsearch.save_local(folder_path=dest_embed_dir)

# Function to process HTML, PDF, and other files and create embeddings
def create_embeddings(embeddings_model, embedding_path: str = "./embed"):
    print(f"Storing embeddings to {embedding_path}")

    documents = []
    total_html_docs = 0  # Variable to keep track of HTML document count

    # Load HTML content from URLs
    for url in urls:
        document_content = html_document_loader(url)
        if document_content:
            total_html_docs += 1
            documents.append({'page_content': document_content, 'metadata': {'type': 'html', 'source': url}})

    # Extract text from PDFs
    pdf_documents = extract_text_from_pdfs(pdf_dir)
    total_pdf_docs = len(pdf_documents)  # Count PDF documents
    documents.extend(pdf_documents)

    # Check if the directory for other files exists and is not empty
    if os.path.exists(other_files_dir) and os.listdir(other_files_dir):
        other_files_documents = extract_text_from_other_files(other_files_dir)
        documents.extend(other_files_documents)

    # Extract the text content for splitting
    document_texts = [doc['page_content'] for doc in documents if isinstance(doc.get('page_content', ""), str)]

    # Split text into chunks and create embeddings
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=0,
        length_function=len,
    )
    print(f"Total HTML documents: {total_html_docs}")
    print(f"Total PDF documents: {total_pdf_docs}")
    print(f"Total other documents: {len(documents) - total_html_docs - total_pdf_docs}")
    print(f"Total documents: {len(document_texts)}")

    texts = text_splitter.create_documents(document_texts)  # Pass the list of strings for splitting
    print(f"Total texts (chunks): {len(texts)}")

    index_docs(embeddings_model, urls, text_splitter, texts, embedding_path)
    print("Generated embedding successfully")
def create_embeddings(embeddings_model, embedding_path: str = "./embed"):
    print(f"Storing embeddings to {embedding_path}")

    documents = []
    total_html_docs = 0  # Variable to keep track of HTML document count

    # Load HTML content from URLs
    for url in urls:
        document_content = html_document_loader(url)
        if document_content:
            total_html_docs += 1
            documents.append({'page_content': document_content, 'metadata': {'type': 'html', 'source': url}})

    # Extract text from PDFs
    pdf_documents = extract_text_from_pdfs(pdf_dir)
    total_pdf_docs = len(pdf_documents)  # Count PDF documents
    documents.extend(pdf_documents)

    # Check if the directory for other files exists and is not empty
    if os.path.exists(other_files_dir) and os.listdir(other_files_dir):
        other_files_documents = extract_text_from_other_files(other_files_dir)
        documents.extend(other_files_documents)

    # Extract the text content for splitting
    document_texts = [doc['page_content'] for doc in documents if isinstance(doc.get('page_content', ""), str)]

    # Split text into chunks and create embeddings
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=0,
        length_function=len,
    )
    print(f"Total HTML documents: {total_html_docs}")
    print(f"Total PDF documents: {total_pdf_docs}")
    print(f"Total other documents: {len(documents) - total_html_docs - total_pdf_docs}")
    print(f"Total documents: {len(document_texts)}")

    texts = text_splitter.create_documents(document_texts)  # Pass the list of strings for splitting
    print(f"Total texts (chunks): {len(texts)}")

    index_docs(embeddings_model, urls, text_splitter, texts, embedding_path)
    print("Generated embedding successfully")


# Example usage



In [15]:
%%time
embeddings_model = NVIDIAEmbeddings(base_url="http://0.0.0.0:11022/v1", model='nvidia/nv-embedqa-e5-v5')
create_embeddings(embeddings_model=embeddings_model)

Storing embeddings to ./embed
Processing PDF: 9781788996242_ColorImages.pdf
Processing PDF: CUDA_C_Programming_Guide.pdf
Processing PDF: [2013] The_CUDA_Handbook.pdf
Processing PDF: sc11-cuda-c-basics.pdf
Processing PDF: openacc-guide.pdf
Processing PDF: CUDA_by_Example.pdf
Total HTML documents: 18
Total PDF documents: 6
Total other documents: 0
Total documents: 24
Total texts (chunks): 6915
Generated embedding successfully
CPU times: user 7.08 s, sys: 1.2 s, total: 8.29 s
Wall time: 33.4 s


In [46]:
!docker ps -s


CONTAINER ID   IMAGE                                                COMMAND                  CREATED        STATUS        PORTS                                         NAMES        SIZE
4ad7b8ee2bb5   nvcr.io/nim/nvidia/nv-embedqa-e5-v5:1.0.1            "/opt/nvidia/nvidia_…"   21 hours ago   Up 21 hours   0.0.0.0:11022->8000/tcp, :::11022->8000/tcp   embed_nim    995kB (virtual 15.7GB)
6ef6ad73da55   nvcr.io/nim/nvidia/nv-rerankqa-mistral-4b-v3:1.0.2   "/opt/nvidia/nvidia_…"   21 hours ago   Up 21 hours   0.0.0.0:11737->8000/tcp, :::11737->8000/tcp   rerank_nim   1.91MB (virtual 15.5GB)
a5732a8ccde9   nvcr.io/nim/meta/llama-3.1-8b-instruct:1.1.2         "/opt/nvidia/nvidia_…"   22 hours ago   Up 22 hours   0.0.0.0:8000->8000/tcp, :::8000->8000/tcp     LLM_nim      9.03MB (virtual 12.9GB)


In [40]:
!docker rmi 3cb29b0d79e6

Untagged: nvcr.io/nim/meta/llama3-8b-instruct:1.0.0
Untagged: nvcr.io/nim/meta/llama3-8b-instruct@sha256:7fe6071923b547edd9fba87c891a362ea0b4a88794b8a422d63127e54caa6ef7
Deleted: sha256:3cb29b0d79e6d84ca4fd132aa66b652408e8c0fb88177cfee06a55418fdc3de2
Deleted: sha256:d363b236bc57a7c2d43748a0272361d9c93a1a830b1b65729d48070734919bf4
Deleted: sha256:0afef4f6b6a5eae810e2dde7db5d571d9ac4ac120f1ab8938b6d25ba923d85b4
Deleted: sha256:113fe49b1d137ede04be15d2eca24351c065822d01a5a6c65410a3b37660784e
Deleted: sha256:90cc92b781e2986bdfff11a03227a7133a237d376e6ae3ada86d28cf710effac
Deleted: sha256:c94eb581f0fdd36f8419033ec5c3791925899c00c75638d88ce8638e959df5c6
Deleted: sha256:419a712e152882383be969cd316bbb38cb4e1d8191e346a2173cf79210e959a1
Deleted: sha256:9c94d94a89cb580e6f0a302668e3330e2dc3461f5db6048bca23f7716735c89b
Deleted: sha256:3ae37cc3ebe01ff609d1829e67fcf8b1d9d51696e49e00d4ab154e1c585a007d
Deleted: sha256:0e876143f4822a81e5fa5d0f89f8e0c13f3c5f5176dd9f51835a445292bacbe8
Deleted: sha256:57a6

In [16]:
embedding_path = "./embed/"
docsearch = FAISS.load_local(folder_path=embedding_path, embeddings=embeddings_model, allow_dangerous_deserialization=True)

In [25]:
llm = ChatNVIDIA(base_url= "http://0.0.0.0:8000/v1", model="meta/llama-3.1-8b-instruct", temperature=0.1, max_tokens=1000, top_p=1.0)

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_prompt=QA_PROMPT

doc_chain = load_qa_chain(llm, chain_type="stuff", prompt=QA_PROMPT)

qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=docsearch.as_retriever(),
    chain_type="stuff",
    memory=memory,
    combine_docs_chain_kwargs={'prompt': qa_prompt},
)

In [26]:
query = "What are the salient features of CUDA"
result = qa({"question": query})
print(result.get("answer"))

It seems like there's a bit of repetition in the text! Based on the provided context, I'll summarize the salient features of CUDA:

1. **No knowledge of OpenGL or DirectX required**: Users don't need to have expertise in these graphics programming interfaces to use CUDA.
2. **No need to force problems to look like computer graphics tasks**: CUDA allows users to focus on their specific problem domain without having to adapt it to a graphics-related framework.
3. **Orders-of-magnitude performance improvement**: CUDA can lead to significant performance enhancements over traditional approaches.

Let me know if you'd like me to clarify or expand on these points!


In [27]:
query = "What is RAG?"
result = qa({"question": query})
print(result.get("answer"))

I don't have the text to answer the question. It looks like the text is repeated multiple times, but it doesn't provide any content.


In [28]:
query = "how to do profiling?"
result = qa({"question": query})
print(result.get("answer"))

The text doesn't provide a direct answer to the question of how to profile CUDA applications, but it does mention that "See the Application Note on CUDA for Tegra for details." This suggests that the Application Note on CUDA for Tegra contains information on how to profile CUDA applications.

So, my answer is: See the Application Note on CUDA for Tegra for details.


In [30]:
query = "Can I read double precision floats from texture?"
result = qa({"question": query})
print(result.get("answer"))

Unfortunately, the provided text does not explicitly state whether you can read double precision floats from a texture or not. It discusses interpolation and precision in the context of a microdemo, but it does not provide a clear answer to the question.


In [31]:
!pip install nemoguardrails



In [33]:
from nemoguardrails import LLMRails, RailsConfig

config = RailsConfig.from_path("/mnt/lustre/hackathons/hack_teams/hack_team_16/workspace/config.yaml")
rails = LLMRails(config)

ValidationError: 1 validation error for RailsConfig
models
  Field required [type=missing, input_value={'instructions': [{'type'.../workspace/config.yaml'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing

In [34]:
!docker inspect LLM_nim

[
    {
        "Id": "a5732a8ccde9ca1fbf867980c23f1dd95424b2072b1f6f77c6f408aa39f48ad5",
        "Created": "2024-10-04T07:31:05.165531824Z",
        "Path": "/opt/nvidia/nvidia_entrypoint.sh",
        "Args": [
            "/opt/nim/start-server.sh"
        ],
        "State": {
            "Status": "running",
            "Running": true,
            "Paused": false,
            "Restarting": false,
            "OOMKilled": false,
            "Dead": false,
            "Pid": 586536,
            "ExitCode": 0,
            "Error": "",
            "StartedAt": "2024-10-04T07:31:05.613158479Z",
            "FinishedAt": "0001-01-01T00:00:00Z"
        },
        "Image": "sha256:c9102c3e95f3acae5b41fdafd7aefd3389bdaf33cf0b3458041e45046bf6bb1d",
        "ResolvConfPath": "/raid/tmp/docker-container-storage-2072/containers/a5732a8ccde9ca1fbf867980c23f1dd95424b2072b1f6f77c6f408aa39f48ad5/resolv.conf",
        "HostnamePath": "/raid/tmp/docker-container-storage-2072/containers/a5732a8ccde9

In [41]:
!docker ps


CONTAINER ID   IMAGE                                                COMMAND                  CREATED      STATUS      PORTS                                         NAMES
4ad7b8ee2bb5   nvcr.io/nim/nvidia/nv-embedqa-e5-v5:1.0.1            "/opt/nvidia/nvidia_…"   4 days ago   Up 4 days   0.0.0.0:11022->8000/tcp, :::11022->8000/tcp   embed_nim
6ef6ad73da55   nvcr.io/nim/nvidia/nv-rerankqa-mistral-4b-v3:1.0.2   "/opt/nvidia/nvidia_…"   4 days ago   Up 4 days   0.0.0.0:11737->8000/tcp, :::11737->8000/tcp   rerank_nim
a5732a8ccde9   nvcr.io/nim/meta/llama-3.1-8b-instruct:1.1.2         "/opt/nvidia/nvidia_…"   4 days ago   Up 4 days   0.0.0.0:8000->8000/tcp, :::8000->8000/tcp     LLM_nim


In [38]:
retriever=docsearch.as_retriever()

In [42]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain_nvidia_ai_endpoints import NVIDIARerank

# Re-initialize and connect to a NeMo Retriever Text Reranking NIM running at localhost:8000
compressor = NVIDIARerank(model="nvidia/nv-rerankqa-mistral-4b-v3",
                          base_url="http://localhost:11737/v1")

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever
)

In [44]:

from langchain.chains import RetrievalQA

query = "an I read double precision floats from texture?"

chain = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever)
chain.invoke(query)

{'query': 'an I read double precision floats from texture?',
 'result': 'No, the provided documentation does not mention support for reading double precision floats from texture. The supported array formats for textures include:\n\n* unsigned char (CU_AD_FORMAT_UNSIGNED_INT8)\n* unsigned short (CU_AD_FORMAT_UNSIGNED_INT16)\n* unsigned int (CU_AD_FORMAT_UNSIGNED_INT32)\n* signed char (CU_AD_FORMAT_SIGNED_INT8)\n* short (CU_AD_FORMAT_SIGNED_INT16)\n* int (CU_AD_FORMAT_SIGNED_INT32)\n* half (IEEE 754 “binary16” format) (CU_AD_FORMAT_SIGNED_HALF)\n* signed float (CU_AD_FORMAT_SIGNED_FLOAT)\n\nThere is no mention of double precision float support.'}

In [47]:

from langchain.chains import RetrievalQA

query = "how to do
chain = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever)
chain.invoke(query)

{'query': 'Question: Consider the following code snippet executing on a GPU architecture where i) width of L1 cache line is 64 bytesii) width of L2 cache line is 16 bytesiii) warp size is 16__global__ void mem_access(float *out, float *in){int tidx = blockIdx.x*blockDim.x + threadIdx.x;out[tidx] = in[tidx*4];}Assume that the number of threads being launched is 512. The size of the in array is 2048and the size of the out array is 512. What are the total number of L1 writes to L2 cache?',
 'result': "To determine the total number of L1 writes to L2 cache, we need to understand how memory accesses are handled in the given code.\n\nThe kernel function `mem_access` is launched with 512 threads, and the out array has a size of 512. This means that each thread writes to a unique location in the out array.\n\nThe in array has a size of 2048, which is larger than the out array. The kernel function accesses the in array with a stride of 4, which means that each thread accesses 4 consecutive elem