<a href="https://colab.research.google.com/github/Ishikaaa/PDF-extraction/blob/main/PDF_extraction_falcon_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install libraries
!pip install PyPDF2
!pip install langchain
!pip install InstructorEmbedding
!pip install sentence-transformers==2.2.2
!pip install faiss-gpu
!pip install -U langchain-community
!pip install python-docx

In [2]:
# import libraries
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS

from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
import re
import os
import time

from PyPDF2 import PdfReader
from docx import Document
from pathlib import Path

In [9]:
def extract_pdf_text(pdf_path):
    """
    Read a PDF and return the text of all its pages as one string.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Page texts concatenated in page order, with no separator
            added between pages. A page whose extraction yields nothing
            contributes an empty string.
    """
    reader = PdfReader(pdf_path)
    # `extract_text()` may return a falsy value; `or ""` keeps the join safe.
    return "".join(page.extract_text() or "" for page in reader.pages)

def extract_docx_text(docx_path):
    """
    Read a DOCX file and return the text of its paragraphs.

    Args:
        docx_path (str): Path to the DOCX file.

    Returns:
        str: Every paragraph's text, each followed by a newline
            (so a non-empty document ends with a trailing newline).
    """
    document = Document(docx_path)
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)


def extract_txt_text(txt_path):
    """
    Read a plain-text file and return its full contents.

    Args:
        txt_path (str): Path to the TXT file.

    Returns:
        str: The file's contents, decoded as UTF-8.
    """
    with open(txt_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


In [20]:
# Step-1: Load the document (PDF, DOCX, or TXT)
def extract_text(file_path):
    """
    Extract text from a file, choosing the reader by file extension.

    The extension is compared case-insensitively; supported formats are
    PDF (.pdf), Word (.docx), and plain text (.txt).

    Args:
        file_path (str): Path to the file.

    Returns:
        str: Text extracted from the file.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == '.pdf':
        return extract_pdf_text(file_path)
    if suffix == '.docx':
        return extract_docx_text(file_path)
    if suffix == '.txt':
        return extract_txt_text(file_path)

    raise ValueError(f"Unsupported file format: {suffix}")


# Step-2: get the text chunks
def get_text_chunks(text):
    """
    Split the text into overlapping chunks to feed to the model.

    The splitter breaks on newlines and is configured with a chunk size
    of 1000 and an overlap of 200, both measured with `len`.

    Args:
        text (str): The full text to split.

    Returns:
        The chunks produced by `CharacterTextSplitter.split_text`.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)


# Step-3: Text Embedding and Create Vector store
def get_vectorstore(text_chunks):
    """
    Embed the text chunks and index them in a FAISS vector store.

    Args:
        text_chunks: The text chunks to embed and index.

    Returns:
        tuple: `(embeddings, vectorstore)` -- the "hkunlp/instructor-xl"
            embedding object and the FAISS store built from the chunks.
    """
    instructor = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    store = FAISS.from_texts(texts=text_chunks, embedding=instructor)
    return instructor, store


# Step-4: Build the retrieval QA chain (the hosted model is used as-is; nothing is trained)
def retrieval_qa_chain(db, return_source_documents):
    """
    Build a RetrievalQA chain around the hosted Falcon-7B-Instruct model.

    Args:
        db: Passed to the chain as its `retriever` (the caller in this
            notebook passes `vector_store.as_retriever(...)`).
        return_source_documents (bool): Forwarded to the chain unchanged.

    Returns:
        The chain created by `RetrievalQA.from_chain_type` with the
        'stuff' chain type.
    """
    generation_settings = {"temperature": 0.2, "max_length": 500, "max_new_tokens": 700}
    falcon = HuggingFaceHub(
        repo_id="tiiuae/falcon-7b-instruct",
        model_kwargs=generation_settings,
    )
    return RetrievalQA.from_chain_type(
        llm=falcon,
        chain_type='stuff',
        retriever=db,
        return_source_documents=return_source_documents,
    )


In [None]:
if __name__ == "__main__":
    # SECURITY: never commit an API token to the notebook. Use the token
    # already present in the environment, or prompt for it without echoing.
    # Any token that was previously hard-coded here should be revoked.
    if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
        from getpass import getpass

        os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

    ## Step-1: Load the document (a single path to a .pdf, .docx, or .txt file)
    # pdf_docs = "HPOODataSheet.pdf"
    pdf_docs = "Ishika_Resume.docx"
    # pdf_docs = "code_documentation.docx"
    raw_text = extract_text(pdf_docs)

    ## Step-2: Split the text into chunks
    text_chunks = get_text_chunks(raw_text)

    ## Step-3: Embed the chunks and build the vector store
    embeddings, vector_store = get_vectorstore(text_chunks)

    ## Step-4: Build the QA chain; the retriever is configured with k=3
    db = vector_store.as_retriever(search_kwargs={'k': 3})
    bot = retrieval_qa_chain(db, True)


In [None]:
## Step-5
while True:
    # Keep answering questions until the user types 'quit' (any letter case).
    query = input("Please enter your response (type 'quit' to exit): ")
    if query.lower() == 'quit':
        print("Goodbye!")
        break

    # Time only the chain call itself.
    start_time = time.time()
    sol = bot(query)
    end_time = time.time()

    # Keep only the text after the last '\nHelpful Answer:' marker
    # (presumably the model output echoes the prompt before it -- confirm).
    answer = sol["result"].split('\nHelpful Answer:')[-1].strip()

    print("Question: ", query)
    print("Answer: ", answer)
    print("Time: ", end_time - start_time)
    print("**************************")