In [3]:
# Install necessary libraries (run this command in your terminal or IDE's console)
# pip install PdfReader langchain PyPDF2 InstructorEmbedding sentence_transformers faiss-cpu

# Import required libraries
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
import os

# Set the Hugging Face API token
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "<YOUR_HUGGINGFACE_API_TOKEN>"


# Function to extract text from PDF files
def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


# Function to create overlapping text chunks
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks


# Function to create embeddings for chunks of text
def get_vectorstore(text_chunks):
    embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore


# Function to create a retrieval LLM chain
def retrieval_qa_chain(db, return_source_documents):
    llm = HuggingFaceHub(
        repo_id="tiiuae/falcon-7b-instruct",
        model_kwargs={"temperature": 0.6, "max_length": 500, "max_new_tokens": 700},
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db,
        return_source_documents=return_source_documents,
    )
    return qa_chain


# Specify the path to your PDF file
path_to_pdf = ["./data_set/new_diseases_data.pdf"]  # Adjust path as needed

# Process PDF and prepare text chunks
raw_text = "THis is a test text"
text_chunks = get_text_chunks(raw_text)

# Create vector store and retrieval database
vectorstore = get_vectorstore(text_chunks)
db = vectorstore.as_retriever(search_kwargs={"k": 3})

# Initialize the question-answering bot
bot = retrieval_qa_chain(db, True)

# Example query
query = "what is Nampdicta?"
sol = bot(query)

# Output the results
print("Answer:", sol["result"])
print("Source Documents:", sol["source_documents"])

# Example questions to test the bot
ques = [
    "what are the origins of Numpalofich Legatrosis",
    "what are the stages of diseases progression in Ramtronephiach Oculosis",
    "what is mortality rate in Wallmic Pulmora",
    "is Numpalactics incubation period short?",
    "what is Numpalactic",
    "What are the symptoms of a disease that causes blindness?",
    "what are the origins of Ramtronephiach Oculosis",
]

# Iterate through questions and print responses
for question in ques:
    sol = bot(question)
    print("Question:", question)
    print("Answer:", sol["result"])

  embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl")


ChunkedEncodingError: ('Connection broken: IncompleteRead(1058977994 bytes read, 3904727025 more expected)', IncompleteRead(1058977994 bytes read, 3904727025 more expected))