<a href="https://colab.research.google.com/github/Ishikaaa/PDF-extraction/blob/main/PDF_extraction_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2
!pip install langchain
!pip install InstructorEmbedding
!pip install sentence-transformers==2.2.2
!pip install faiss-gpu
!pip install -U langchain-community

In [4]:
# import libraries
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS

from langchain.llms import HuggingFacePipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [12]:
# Step-1: get text from single OR multiple PDFs
def get_pdf_text(pdf_docs):
    """
    Extract the text of every page of one or more PDFs as a single string.

    args:
        pdf_docs: list of pdfs; each item is handed to ``PdfReader`` as-is.
            A single path given as ``str`` or ``os.PathLike`` is also
            accepted and treated as a one-element list.

    : return
        the text of all pages concatenated in document order and page order,
        with nothing inserted between pages ("" when there are no pdfs)
    """
    import os  # local import so this cell does not depend on the import cell

    # A bare path string is itself iterable (character by character), so wrap
    # it instead of passing single characters to PdfReader.
    if isinstance(pdf_docs, (str, os.PathLike)):
        pdf_docs = [pdf_docs]

    page_texts = []

    # iterate through all pdfs
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        # iterate through all pages
        for page in pdf_reader.pages:
            # `or ""` keeps the concatenation from failing with TypeError
            # should extract_text() hand back None instead of a string.
            page_texts.append(page.extract_text() or "")

    # One join instead of repeated `+=` avoids re-copying the growing string.
    return "".join(page_texts)


# Step-2: get the text chunks
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """
    Split text into overlapping chunks with ``CharacterTextSplitter``.

    args:
        text: the text to split
        chunk_size: value passed to the splitter as ``chunk_size``; lengths
            are measured with ``len`` (i.e. in characters)
        chunk_overlap: value passed to the splitter as ``chunk_overlap``
        separator: string the splitter splits on (a newline by default)

    : return
        a list of chunks of text that we will feed to our model
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)


# Step-3: Create Vector store
def get_vectorstore(text_chunks, model_name="hkunlp/instructor-xl"):
    """
    Embed the text chunks and index them in a FAISS vector store.

    args:
        text_chunks: list of text chunks to embed and index
        model_name: model name handed to ``HuggingFaceInstructEmbeddings``

    : return
        the vector store returned by ``FAISS.from_texts``

    : raises
        ValueError: if ``text_chunks`` is empty
    """
    # Check before the embedding model is loaded: with no chunks there is
    # nothing to index (e.g. the PDFs yielded no extractable text), and a clear
    # message here is more useful than a failure deep inside the index build.
    if not text_chunks:
        raise ValueError("text_chunks is empty: no text to embed and index")

    # instructor embeddings
    embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

def train_model(model_id="gpt2", max_new_tokens=10):
    """
    Load a pretrained causal language model and wrap it for LangChain.

    Despite the name, nothing is trained here: the tokenizer and model are
    loaded with ``from_pretrained`` and wrapped in a generation pipeline.

    args:
        model_id: model identifier passed to ``from_pretrained`` for both the
            tokenizer and the model
        max_new_tokens: passed to the pipeline as ``max_new_tokens``

    : return
        a ``HuggingFacePipeline`` wrapping the generation pipeline
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)

    # The model is a causal LM, i.e. a text generator, so the matching task is
    # "text-generation". "question-answering" is the extractive-QA task: it
    # expects a QA-head model and question/context inputs, not the single
    # prompt string that the LangChain wrapper sends.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
    )

    return HuggingFacePipeline(pipeline=pipe)


In [None]:
if __name__ == "__main__":
    # Step-1: extract the raw text of the PDF(s)
    pdf_docs = ["NIPS-2017-attention-is-all-you-need-Paper.pdf"]
    raw_text = get_pdf_text(pdf_docs)

    # Step-2: get the text chunks
    text_chunks = get_text_chunks(raw_text)

    # Step-3: embed the chunks and build the vector store
    vector_store = get_vectorstore(text_chunks)
    # The label names what is actually printed (the store, not the chunks).
    print("vector_store: ", vector_store)

    # Step-4: load the LLM. `vector_store` and `llm_model` are left in the
    # notebook's global namespace because step5() reads them from there.
    llm_model = train_model()


In [None]:
# Step-5
from langchain.chains.question_answering import load_qa_chain

def step5(query="what is Encoder and Decoder Stacks", k=3):
    """
    Answer a question from the indexed PDF text and print the answer.

    Reads the notebook globals ``vector_store`` and ``llm_model``, so the
    cell that builds them must have been run first.

    args:
        query: the question; used both for the similarity search and as the
            question given to the QA chain
        k: number of documents requested from the similarity search

    : return
        None; the answer is printed
    """
    # Retrieve the k chunks most similar to the question.
    docs = vector_store.similarity_search(query=query, k=k)

    # "stuff" chain type, run over the retrieved chunks and the question.
    chain = load_qa_chain(llm=llm_model, chain_type="stuff")
    response = chain.run(input_documents=docs, question=query)

    print("response: ", response)

# Run the demo query. step5() reads the globals `vector_store` and `llm_model`,
# so the driver cell that assigns them must have been executed first.
step5()