# Build Custom Chatbot With PDF Documents

In [1]:
# Show the kernel's current working directory before any directory change.
%pwd

'd:\\AI_Projects\\LLM-Based-Medical-Question-Answering-System\\research'

In [2]:
import os

# Step up from the research/ folder to the project root so that relative paths
# used later (requirements.txt, ./data) resolve.
# Guarded on the current folder name: an unconditional os.chdir("../") climbs
# one more directory every time this cell is re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
# Confirm the working directory after the os.chdir call above.
%pwd

'd:\\AI_Projects\\LLM-Based-Medical-Question-Answering-System'

## Install Libraries

In [5]:
!pip install -r requirements.txt

Obtaining file:///D:/AI_Projects/LLM-Based-Medical-Question-Answering-System (from -r requirements.txt (line 13))
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting langchain-community (from -r requirements.txt (line 4))
  Downloading langchain_community-0.2.19-py3-none-any.whl.metadata (2.7 kB)
Collecting pyvis (from -r requirements.txt (line 9))
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting einops (from -r requirements.txt (line 10))
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting langsmith<0.1.0,>=0.0.11 (from langchain==0.0.255->-r requirements.txt (line 3))
  Using cached langsmith-0.0.92-py3-none-any.whl.metadata (9.9 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community (from -r requirements.txt (line 4))
  Downloading langchain_commu

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-pinecone 0.1.3 requires langchain-core<0.3,>=0.1.52, but you have langchain-core 0.1.23 which is incompatible.


In [None]:
## [Optional] Uncomment the lines below to send INFO-level log records to stdout.
## NOTE(review): basicConfig(stream=...) and the extra StreamHandler both write
## to stdout, so enabling both lines may print each record twice — confirm.
# import os, logging, sys

# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [4]:
import os

from dotenv import load_dotenv

# Populate os.environ from the project's .env file.
load_dotenv()

# Hugging Face access token; it is used by the login cell that follows.
Llama_1B_ACCESS_TOKEN = os.environ.get('Llama_ACCESS_TOKEN')

# Fail here with an actionable message: a missing variable would otherwise
# surface later as a TypeError when None is written into os.environ.
if not Llama_1B_ACCESS_TOKEN:
    raise RuntimeError(
        "Environment variable 'Llama_ACCESS_TOKEN' is not set; "
        "add it to your .env file or export it before running this notebook."
    )

In [5]:
import os
from huggingface_hub import login

# Publish the token under HF_KEY, then authenticate against the Hugging Face Hub
# with the value read back from the environment. add_to_git_credential=True is
# forwarded to huggingface_hub.login unchanged.
os.environ["HF_KEY"] = Llama_1B_ACCESS_TOKEN
login(
    token=os.environ.get('HF_KEY'),
    add_to_git_credential=True,
)

  from .autonotebook import tqdm as notebook_tqdm


## Load PDF document

In [6]:
from llama_index.core import SimpleDirectoryReader

# required_exts takes a list of file extensions. Passing the bare string ".pdf"
# risks a substring-style membership test instead of an exact extension match.
documents = SimpleDirectoryReader(input_dir="./data", required_exts=[".pdf"]).load_data()

# Number of Document objects produced by the reader.
len(documents)

637

## Initialize Embedding Model

In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Hugging Face model id of the sentence-embedding model.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Embedding model shared by the indexes built later in this notebook.
embed_model = HuggingFaceEmbedding(
    model_name=EMBEDDING_MODEL_NAME,
    embed_batch_size=8,
)

In [8]:
from llama_index.core import Settings

# Global llama_index defaults: chunking parameters used when documents are
# split, and the embedding model created above.
Settings.chunk_size = 512
Settings.chunk_overlap = 30
Settings.embed_model = embed_model

## Define Prompts

In [9]:
from llama_index.core import PromptTemplate

# System instructions handed to the LLM (passed as system_prompt below).
system_prompt = (
    "<|SYSTEM|># You are a helpful AI assistant for answering questions.\n"
    "Your goal is to provide accurate and concise responses based on the context provided.\n"
    "If you're unsure or lack information, please say I don't know about it.\n"
    "Feel free to use polite greetings to engage with the user.\n"
)

# Wrapper applied around llama-index's internal default prompts; {query_str}
# is the placeholder for the query text.
query_wrapper_prompt = PromptTemplate(
    "<|USER|>{query_str}<|ASSISTANT|>"
)

## Initialize Model

In [10]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

LLM_MODEL_NAME = "meta-llama/Llama-3.2-1B"

# Load the model and tokenizer directly from the Hugging Face Hub.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=512,
    # Greedy decoding. The earlier {"temperature": 0.7, "do_sample": False} was
    # self-contradictory: temperature is only used when sampling is enabled, so
    # it was ignored. Set do_sample=True and add a temperature if sampling is
    # actually wanted.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=LLM_MODEL_NAME,
    model_name=LLM_MODEL_NAME,
    device_map="auto",
    # fp16 weights with 8-bit loading to reduce memory usage on CUDA.
    # NOTE(review): transformers reports `load_in_8bit` as deprecated in favour
    # of passing a BitsAndBytesConfig via `quantization_config`; migrate when
    # the notebook's dependencies are updated.
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)

# Make this LLM the llama_index default.
Settings.llm = llm

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


## Vector RAG

Note: Both approaches are included here:

* Build an in-memory index and query it directly in this notebook
* Use Pinecone to store the vector index in the Pinecone cloud

In [11]:
from llama_index.core import VectorStoreIndex

# Build the vector index from the loaded documents using the embedding model
# defined above.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
)

Initialize Query engine

In [12]:
# Query engine over the vector index; similarity_top_k=3 is passed to the retriever.
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=3,
)

Formatting output

In [13]:
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a <style> block that makes <pre> output wrap long lines.

  Args:
    info: Execution info that IPython passes to `pre_run_cell` callbacks.
      It is not used; the parameter exists so the callback matches the
      signature IPython calls it with, and it is optional so a direct
      `set_css()` call still works.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Run set_css before each subsequent cell execution.
get_ipython().events.register('pre_run_cell', set_css)

Result generation with RAG

In [16]:
# Interactive question/answer loop against `query_engine`.
done = False
while not done:
  print("*"*30)
  # Strip surrounding whitespace so a blank line is not sent to the LLM.
  question = input("Enter your question: ").strip()
  if question:
    response = query_engine.query(question)
    print("User Query: "+ question)
    print("ChatBot Response: ")
    print(response)
    #print(response.get_formatted_sources())
  # Accept "y", "Y", "yes" and stray whitespace as a request to stop.
  done = input("End the chat? (y/n): ").strip().lower() in ("y", "yes")

******************************


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


User Query: What can I do to remove acne?
ChatBot Response: 
 can help you remove acne by using
the following steps:
1. Wash the face with mild soap and water only two or
three times a day, unless the physician says to wash it
more often.
2. Avoid using abrasive soaps or cleansers and products
that might dry the skin or make it peel, such as medicated
cosmetics, cleansers that contain alcohol, or other acne
products that contain resorcinol, sulfur or salicylic acid.
3. If benzoyl peroxide or tretinoin make the skin too red
or too dry or cause too much peeling, check with a
physician. Using the medicine less often or using a
weaker strength may be necessary.
4. Tretinoin may increase sensitivity to sunlight. While
being treated with this medicine, avoid exposure to the
sun and do not use tanning beds, tanning booths, or sunlamps.
If it is not possible to avoid being in the sun, use a sun-
screen with a skin protection factor (SPF) of at least 15 or
wear protective clothing over the trea

## Store Vector RAG to Pinecone 

In [8]:
import os

from dotenv import load_dotenv

# Populate os.environ from the project's .env file.
load_dotenv()

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

# Fail early with a clear message; a missing key would otherwise surface later
# as a TypeError when None is written back into os.environ.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "Environment variable 'PINECONE_API_KEY' is not set; "
        "add it to your .env file or export it before running this section."
    )

In [9]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the API key loaded above.
pc = Pinecone(
    api_key=PINECONE_API_KEY,
)

In [11]:
index_name = "medical-data"

# Create the serverless index only when it does not exist yet, so this cell can
# be re-run; an unconditional create_index fails once the index is present.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the vector size produced by embed_model
        # (assumed to be 384 for all-MiniLM-L6-v2 — TODO confirm).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

Storing data to Pinecone

In [12]:
# Write the key into the process environment under PINECONE_API_KEY.
os.environ.update(PINECONE_API_KEY=PINECONE_API_KEY)

In [None]:
from langchain_pinecone import PineconeVectorStore


class _LlamaIndexEmbeddings:
  """Expose a llama_index embedding model through the two methods LangChain
  vector stores call: `embed_documents` and `embed_query`."""

  def __init__(self, model):
    # model: a llama_index embedding model (here: embed_model).
    self._model = model

  def embed_documents(self, texts):
    """Return one embedding vector per input text."""
    return self._model.get_text_embedding_batch(list(texts))

  def embed_query(self, text):
    """Return the embedding vector for a single query string."""
    return self._model.get_query_embedding(text)


# `documents` are llama_index Document objects and `embed_model` is a
# llama_index embedding model. LangChain's from_documents expects its own
# Document type (with .page_content) and embedding interface, so upload the
# raw text and metadata through from_texts with the adapter instead.
docsearch = PineconeVectorStore.from_texts(
    texts=[doc.text for doc in documents],
    embedding=_LlamaIndexEmbeddings(embed_model),
    metadatas=[dict(doc.metadata) for doc in documents],
    index_name=index_name,
)

In [None]:
# Load Existing index

from langchain_pinecone import PineconeVectorStore


class _LlamaIndexEmbeddings:
  """Expose a llama_index embedding model through the two methods LangChain
  vector stores call: `embed_documents` and `embed_query`.

  Defined in this cell so that it can be run on its own in a fresh kernel.
  """

  def __init__(self, model):
    # model: a llama_index embedding model (here: embed_model).
    self._model = model

  def embed_documents(self, texts):
    """Return one embedding vector per input text."""
    return self._model.get_text_embedding_batch(list(texts))

  def embed_query(self, text):
    """Return the embedding vector for a single query string."""
    return self._model.get_query_embedding(text)


# embed_model is a llama_index embedding model, which does not provide
# LangChain's embed_query; wrap it so that retrieval can embed the query.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=_LlamaIndexEmbeddings(embed_model)
)

In [None]:
# Similarity-search retriever over the Pinecone store, configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
# Probe the retriever with a sample question and display what comes back.
retrieved_docs = retriever.invoke(
    "What is Acne?"
)
retrieved_docs

## Graph RAG

Note: This section works correctly on a small dataset (tested with a separate 2-page document). For this project's full dataset, however, my available compute was not sufficient to finish building the graph.

In [17]:
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex
from llama_index.core.graph_stores import SimpleGraphStore

# Storage context backed by a SimpleGraphStore; the index below writes its
# graph through it.
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(
    graph_store=graph_store,
)

# Build the knowledge-graph index over the loaded documents.
# include_embeddings=True also stores vector embeddings to enhance search;
# max_triplets_per_chunk=3 is passed through to the index builder.
index_graph = KnowledgeGraphIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    embed_model=embed_model,
    max_triplets_per_chunk=3,
    include_embeddings=True,
)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

KeyboardInterrupt: 

In [None]:
from pyvis.network import Network

# Convert the knowledge graph to a networkx graph, load it into a directed
# pyvis network, and write the visualisation to rag_graph.html.
g = index_graph.get_networkx_graph()
net = Network(
    notebook=True,
    cdn_resources="in_line",
    directed=True,
)
net.from_nx(g)
net.save_graph("rag_graph.html")

In [None]:
from IPython.display import HTML, display

# Render the saved graph page inline as this cell's output.
HTML(
    filename="rag_graph.html",
)

In [None]:
# Query engine over the knowledge-graph index. This rebinds the name
# `query_engine`, so the chat loop below talks to the graph index.
query_engine = index_graph.as_query_engine(
    llm=llm,
    similarity_top_k=3,
)

In [None]:
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a <style> block that makes <pre> output wrap long lines.

  Args:
    info: Execution info that IPython passes to `pre_run_cell` callbacks.
      It is not used; the parameter exists so the callback matches the
      signature IPython calls it with, and it is optional so a direct
      `set_css()` call still works.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Run set_css before each subsequent cell execution.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Interactive question/answer loop against `query_engine`.
done = False
while not done:
  print("*"*30)
  # Strip surrounding whitespace so a blank line is not sent to the LLM.
  question = input("Enter your question: ").strip()
  if question:
    response = query_engine.query(question)
    print(response)
    #print(response.get_formatted_sources())
  # Accept "y", "Y", "yes" and stray whitespace as a request to stop.
  done = input("End the chat? (y/n): ").strip().lower() in ("y", "yes")