<a href="https://colab.research.google.com/github/Ishikaaa/PDF-extraction/blob/main/PDF_extraction_falcon_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install libraries
!pip install PyPDF2
!pip install langchain
!pip install InstructorEmbedding
!pip install sentence-transformers==2.2.2
!pip install faiss-gpu
!pip install -U langchain-community

In [None]:
# import libraries
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS

from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
import re
import os
import time

In [None]:
# Step-1: get text from single OR multiple PDFs
def get_pdf_text(pdf_docs):
    """
    Concatenate the extracted text of every page of every PDF.

    args:
        pdf_docs: list of pdfs (each item is handed to PdfReader as-is)
    : return
        a single string: page texts in document order, then page order,
        with nothing inserted between pages or between documents
    """
    page_texts = []

    # one reader per pdf; collect the text of each of its pages in order
    for document in pdf_docs:
        reader = PdfReader(document)
        page_texts.extend(page.extract_text() for page in reader.pages)

    return "".join(page_texts)


# Step-2: get the text chunks
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """
    Split the raw text into overlapping chunks with a CharacterTextSplitter.

    args:
        text: the full text to split
        chunk_size: target chunk length, measured with len() (default 1000)
        chunk_overlap: amount of text shared between neighbouring chunks,
            measured with len() (default 200)
        separator: string the text is split on before pieces are grouped
            into chunks (default "\\n")
    : return
        a list of chunks of text that we will feed to our model
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)


# Step-3: Text Embedding and Create Vector store
def get_vectorstore(text_chunks, model_name="hkunlp/instructor-xl"):
    """
    Embed the text chunks and index them in a FAISS vector store.

    args:
        text_chunks: non-empty list of strings to embed and index
        model_name: id of the Instructor embedding model to load
            (default "hkunlp/instructor-xl")
    : return
        (embeddings, vectorstore): the embedding object and the FAISS store
        built from text_chunks with it
    : raises
        ValueError: if text_chunks is empty
    """
    # Fail fast, before the embedding model is loaded: an empty chunk list
    # (e.g. a PDF with no extractable text) gives the index nothing to hold.
    if not text_chunks:
        raise ValueError(
            "text_chunks is empty: no text to embed (does the PDF contain extractable text?)"
        )

    embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return embeddings, vectorstore


# Step-4: Build the retrieval QA chain (LLM + retriever; nothing is trained here)
def retrieval_qa_chain(db, return_source_documents):
    """
    Build a 'stuff' RetrievalQA chain on top of the falcon-7b-instruct LLM.

    args:
        db: object handed to RetrievalQA as its `retriever`
        return_source_documents: forwarded unchanged to
            RetrievalQA.from_chain_type
    : return
        the chain returned by RetrievalQA.from_chain_type
    """
    # NOTE(review): both "max_length" and "max_new_tokens" are sent to the
    # model -- confirm which of the two limits the endpoint actually honours.
    generation_settings = {"temperature": 0.2, "max_length": 500, "max_new_tokens": 700}
    falcon_llm = HuggingFaceHub(
        repo_id="tiiuae/falcon-7b-instruct",
        model_kwargs=generation_settings,
    )

    return RetrievalQA.from_chain_type(
        llm=falcon_llm,
        chain_type='stuff',
        retriever=db,
        return_source_documents=return_source_documents,
    )


In [None]:
if __name__ == "__main__":
    # Keep the secret out of the notebook source and do not clobber a token
    # that is already configured: prompt (without echo) only when it is missing.
    if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
        from getpass import getpass

        os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

    ## Step-1 Load PDF
    pdf_docs = ["HPOODataSheet.pdf"]
    raw_text = get_pdf_text(pdf_docs)

    ## Step-2: get the text chunks
    # Chunk the text extracted in Step-1 (the previous argument, `clean_text`,
    # was never defined and raised NameError).
    text_chunks = get_text_chunks(raw_text)

    ## Step-3
    embeddings, vector_store = get_vectorstore(text_chunks)

    ## Step-4
    # Retrieve the 3 best-matching chunks per question and ask the chain to
    # return the source documents alongside the answer.
    db = vector_store.as_retriever(search_kwargs={'k': 3})
    bot = retrieval_qa_chain(db, True)

In [None]:
## Step-5
# Interactive question loop: read a question, run it through `bot`, print the
# answer and how long the call took. Typing 'quit' ends the loop.
while True:
    # Strip surrounding whitespace so "quit " still exits and blank input is detectable.
    query = input("Please enter your response (type 'quit' to exit): ").strip()
    if query.lower() == 'quit':
        print("Goodbye!")
        break
    if not query:
        # Nothing was typed: ask again instead of sending an empty question to the LLM.
        continue

    # perf_counter is a monotonic clock meant for measuring elapsed time.
    start_time = time.perf_counter()
    sol = bot(query)
    end_time = time.perf_counter()

    # Keep only the text after the last "Helpful Answer:" marker in the result.
    answer = sol["result"].split('\nHelpful Answer:')[-1].strip()
    print("Question: ", query)
    print("Answer: ", answer)
    print("Time: ", end_time - start_time)
    print("**************************")

Please enter your response (type 'quit' to exit): What are the Networking of HP OO?
Question:  What are the Networking of HP OO?
Answer:  HP OO is a suite of products that provides a unified platform for managing and automating IT operations. It includes HP OO Studio, HP OO Operations Orchestration, HP OO Service Desk, HP OO Service Level Management, HP OO Change Management, HP OO Incident Management, HP OO Service Management, HP OO Service Automation, HP OO Service Orchestration, HP OO Service Management, HP OO Service Desk, HP OO Service Level Management, HP OO Change Management, HP OO Incident Management, HP OO Service Management, HP OO Service Orchestration, HP OO Service Automation, HP OO Service Management, HP OO Service Desk, HP OO Service Level Management, HP OO Service Management, HP OO Service Orchestration, HP OO Service Automation, HP OO Service Orchestration, HP OO Service Management, HP OO Service Desk, HP OO Service Level Management, HP OO Service Management, HP OO Servi