<a href="https://colab.research.google.com/github/Ishikaaa/PDF-extraction/blob/main/PDF_extraction_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2
!pip install langchain
!pip install InstructorEmbedding
!pip install sentence-transformers==2.2.2
!pip install faiss-gpu
!pip install -U langchain-community

In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS

# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
import re
# import pickle
# import os
import time

In [2]:
# Step-1: get text from single OR multiple PDFs
def get_pdf_text(pdf_docs):
    """
    Extract the text of every page of one or more PDFs as a single string.

    args:
        pdf_docs: iterable of pdfs, each one passed as-is to ``PdfReader``
    return:
        the text of all pages in document/page order, with a newline between
        consecutive pages ("" when pdf_docs is empty)
    """
    page_texts = []

    # iterate through all pdfs
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        # iterate through all pages
        for page in pdf_reader.pages:
            # Fall back to "" if extraction yields a falsy value (presumably
            # possible for pages without a text layer), so joining never fails.
            page_texts.append(page.extract_text() or "")

    # A newline between pages keeps the last word of one page from being
    # fused with the first word of the next; join also avoids repeated `+=`.
    return "\n".join(page_texts)


# Step-2: Clean the raw text
def clean_context(context):
    """
    Normalise raw text before chunking.

    args:
        context: raw text
    return:
        the text with every run of whitespace replaced by one space and all
        letters lowercased; nothing else (emails, URLs, digits) is removed
    """
    # Spaces, tabs and newlines all match \s, so line breaks are flattened too.
    collapsed = re.sub(r'\s+', ' ', context)
    return collapsed.lower()


# Step-3: get the text chunks
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200):
    """
    Split text into overlapping chunks measured in characters.

    args:
        text: the text to split
        chunk_size: target maximum length of a chunk (len() of the string)
        chunk_overlap: how many characters consecutive chunks may share
    return:
        a list of chunks of text that we will feed to our model
    """
    # CharacterTextSplitter only cuts at its separator. Text whose whitespace
    # has been collapsed to single spaces contains no "\n", which would leave
    # the whole document as one oversized chunk, so fall back to splitting on
    # spaces in that case.
    separator = "\n" if "\n" in text else " "

    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)


# Step-4: Create Vector store
def get_vectorstore(text_chunks):
    """
    Build a FAISS vector store from the given text chunks.

    args:
        text_chunks: list of text chunks to embed
    return:
        the store created by ``FAISS.from_texts`` using the
        "hkunlp/instructor-xl" instructor embeddings
    """
    instructor = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=instructor)


def train_model(model_name):
    """
    Load a pretrained T5 checkpoint and wrap it in a generation pipeline.

    Despite the name, no training is performed here: the tokenizer and model
    are only loaded with ``from_pretrained``.

    args:
        model_name: checkpoint name passed to ``from_pretrained``
    return:
        a "text2text-generation" pipeline built from that model and tokenizer
    """
    t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
    t5_model = T5ForConditionalGeneration.from_pretrained(model_name)

    return pipeline("text2text-generation", model=t5_model, tokenizer=t5_tokenizer)


def step6(vector_store, generator, question):
    """
    Answer a question from the indexed text and print the result.

    args:
        vector_store: object whose ``similarity_search(question)`` returns
            documents exposing a ``page_content`` string
        generator: callable taking the prompt plus generation keyword
            arguments and returning a list whose first item has a
            'generated_text' entry
        question: the question to answer
    return:
        the generated answer text
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments.
    begin = time.perf_counter()

    docs = vector_store.similarity_search(question)
    # Use every retrieved document as context, not just the best match
    context = " ".join(doc.page_content for doc in docs)

    # Combine the question and context for T5
    input_text = f"question: {question} context: {context}. Provide a detailed answer."

    # Generate the answer with specific parameters
    result = generator(input_text, max_length=500, num_beams=5)

    # Keep only the generated text rather than the raw pipeline output
    answer = result[0]['generated_text']

    runtime = time.perf_counter() - begin
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print(f"runtime: {runtime}")
    print("#########################")

    return answer

In [None]:
if __name__ == "__main__":
    # Step-1 Load PDF
    pdf_docs = ["HPOODataSheet.pdf"]
    raw_text = get_pdf_text(pdf_docs)

    # Step-2: Clean the raw text
    clean_text = clean_context(raw_text)

    # Step-3: get the text chunks
    text_chunks = get_text_chunks(clean_text)

    # Step-4: Create the vector store
    vector_store = get_vectorstore(text_chunks)

    # Step-5: Load the pretrained model and build the generation pipeline.
    # The pipeline is rebuilt on every run; it is not cached on disk.
    model_name = "t5-large"
    generator = train_model(model_name)

    # Step-6: interactive question loop
    while True:
        try:
            # Strip so that stray spaces around 'exit' or an otherwise empty
            # line are handled as the user intended.
            question = input("Question (type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the prompt was interrupted: leave cleanly
            # instead of ending with a traceback.
            print("Goodbye!")
            break

        if question.lower() == 'exit':
            print("Goodbye!")
            break
        if not question:
            # Nothing to ask; skip the expensive retrieval and generation.
            continue

        step6(vector_store, generator, question)