<a href="https://colab.research.google.com/github/Ishikaaa/PDF-extraction/blob/main/PDF_extraction_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2
!pip install langchain
!pip install InstructorEmbedding
!pip install sentence-transformers==2.2.2
!pip install faiss-gpu
!pip install -U langchain-community
!pip install datasets
!pip install transformers
!pip install accelerate

In [3]:
# import libraries
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS

from langchain.llms import HuggingFacePipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [4]:
# import libraries
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS

# from langchain.llms import HuggingFacePipeline
# from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline

# Step-1: get text from single OR multiple PDFs
def get_pdf_text(pdf_docs):
    """
    Extract the text of every page of every PDF and concatenate it.

    args:
        pdf_docs: iterable of pdfs; each item is handed unchanged to
            ``PdfReader`` (the notebook passes file paths)
    : return
        a single string with the page texts in document order, then page
        order. No separator is inserted between pages. Returns "" when
        pdf_docs is empty.
    """
    page_texts = []

    # iterate through all pdfs
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        # iterate through all pages
        for page in pdf_reader.pages:
            # `or ""` keeps a None/empty extraction result from breaking the
            # concatenation and aborting the whole run.
            page_texts.append(page.extract_text() or "")

    # Join once at the end instead of growing a string with += per page.
    return "".join(page_texts)


# Step-2: get the text chunks
def get_text_chunks(text):
    """
    Split the raw text into overlapping chunks.

    args:
        text: the full text to split
    : return
        the list produced by CharacterTextSplitter.split_text, configured to
        split on newlines with chunk_size=1000 and chunk_overlap=200, both
        measured with len()
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)


# Step-3: Create Vector store
def get_vectorstore(text_chunks):
    """
    Embed the chunks and index them in a FAISS vector store.

    args:
        text_chunks: list of text chunks to index
    : return
        the store built by FAISS.from_texts using the
        "hkunlp/instructor-xl" instructor embeddings
    """
    instructor = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=instructor)


def train_model(model_name):
    """
    Load a pretrained T5 checkpoint and wrap it in a generation pipeline.

    Despite its name this function does not train anything; it only loads
    the tokenizer and model identified by model_name.

    args:
        model_name: checkpoint identifier passed to ``from_pretrained``
            (e.g. "t5-small" or "t5-large")
    : return
        a transformers "text2text-generation" pipeline
    """
    # Honour the caller's choice: the argument used to be overwritten with a
    # hard-coded "t5-large", so asking for a smaller model had no effect.
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Create a pipeline for text2text generation
    generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
    return generator


def step5(vector_store, generator, questions):
    """
    Answer each question from its best-matching chunk and print the result.

    args:
        vector_store: object whose similarity_search(question) returns a list
            of documents exposing a page_content attribute
        generator: callable invoked with the prompt plus generation keyword
            arguments; must return a list whose first item is a dict with a
            'generated_text' key
        questions: iterable of question strings
    : return
        None; context, question and answer are printed for every question
    """
    for question in questions:
        docs = vector_store.similarity_search(question)
        if not docs:
            # Without this guard docs[0] raises IndexError and the remaining
            # questions are never answered.
            print(f"Question: {question}")
            print("Answer: no matching context found in the vector store.")
            continue

        # Only the first search result is used as context.
        context = docs[0].page_content

        # Combine the question and context for T5
        input_text = f"question: {question} context: {context}. Provide a detailed answer."

        # Generate the answer with specific parameters
        result = generator(input_text, max_length=150, num_beams=4, early_stopping=True)

        # The generator already returns text; pick the first candidate
        answer = result[0]['generated_text']

        print(f"Context: {context}")
        print(f"Question: {question}")
        print(f"Answer: {answer}")

In [None]:
if __name__ == "__main__":
    # Step-1: load the PDF(s) and extract their raw text
    pdf_docs = ["NIPS-2017-attention-is-all-you-need-Paper.pdf"]
    raw_text = get_pdf_text(pdf_docs)

    # Step-2: get the text chunks
    text_chunks = get_text_chunks(raw_text)

    # Step-3: embed the chunks and build the vector store
    vector_store = get_vectorstore(text_chunks)
    # The label now names the object that is actually printed.
    print("vector_store: ", vector_store)

    # Step-4 (optional baseline without fine-tuning)
    # # model_name = "t5-large"
    # model_name = "t5-small"
    # generator = train_model(model_name)

    # # Step-5
    # questions = ["Who is Aidan N. Gomez", "What is encoder and decoder?"]
    # step5(vector_store, generator, questions)

load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, pipeline, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict

def fine_tune_model(train_texts, model_name="t5-large", save_dir="./fine_tuned_model"):
    """
    Fine-tune a T5 checkpoint on a list of texts and save the result.

    Every text is used as both the model input and the target, so the model
    is trained to reproduce its input. The first 80% of train_texts (in the
    given order) forms the training set and the remainder the validation
    set, which is evaluated at the end of every epoch.

    args:
        train_texts: sequence of at least two text strings
        model_name: checkpoint identifier passed to ``from_pretrained``
        save_dir: directory the fine-tuned model and tokenizer are written to
    : return
        (model, tokenizer) after training
    : raises
        ValueError: if fewer than two texts are supplied
    """
    # Validate before loading the (large) checkpoint: with fewer than two
    # texts the 80/20 split below leaves the training set empty.
    train_texts = list(train_texts)
    if len(train_texts) < 2:
        raise ValueError(
            "fine_tune_model needs at least 2 texts to build non-empty "
            f"training and validation sets, got {len(train_texts)}"
        )

    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Tokenize the dataset
    def preprocess_data(examples):
        inputs = examples['input_text']
        targets = examples['target_text']
        model_inputs = tokenizer(inputs, max_length=512, truncation=True)
        labels = tokenizer(targets, max_length=512, truncation=True)
        # The target token ids become the labels the Trainer computes loss on.
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Split the train_texts into training and validation sets (80/20 split).
    # With len >= 2 both slices are guaranteed to be non-empty.
    split_index = int(0.8 * len(train_texts))
    train_texts_split = train_texts[:split_index]
    val_texts_split = train_texts[split_index:]

    # Prepare datasets (input and target are deliberately the same text)
    train_dataset = Dataset.from_dict({"input_text": train_texts_split, "target_text": train_texts_split})
    val_dataset = Dataset.from_dict({"input_text": val_texts_split, "target_text": val_texts_split})

    train_dataset = train_dataset.map(preprocess_data, batched=True)
    val_dataset = val_dataset.map(preprocess_data, batched=True)

    # Define training arguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    # NOTE(review): warmup_steps=500 may exceed the total number of optimizer
    # steps for a small corpus - confirm this is intended.
    training_args = TrainingArguments(
        output_dir='./results',
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch"
    )

    # Data collator: batches the variable-length tokenized examples
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer side by side so the directory can be
    # reloaded with from_pretrained(save_dir)
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    return model, tokenizer

def create_generator(model_name):
    """
    Build a text2text-generation pipeline from a T5 checkpoint.

    args:
        model_name: checkpoint identifier or directory passed to
            ``from_pretrained`` for both the tokenizer and the model
    : return
        a transformers "text2text-generation" pipeline
    """
    # Keyword arguments are evaluated left to right, so the tokenizer is
    # still loaded before the model.
    return pipeline(
        "text2text-generation",
        tokenizer=T5Tokenizer.from_pretrained(model_name),
        model=T5ForConditionalGeneration.from_pretrained(model_name),
    )


In [None]:
# Step 4: fine-tune T5 on the chunks extracted from the PDF
model_name = "t5-large"
model, tokenizer = fine_tune_model(train_texts=text_chunks, model_name=model_name)

# Step 5: build the generation pipeline from the checkpoint saved in step 4
generator = create_generator(model_name='./fine_tuned_model')

# Step 6


In [None]:
# Questions to answer from the indexed PDF with the fine-tuned generator
questions = [
    "Who is Aidan N. Gomez",
    "What is encoder and decoder stacks?",
    "What did Ying Cao do?",
]
step5(vector_store=vector_store, generator=generator, questions=questions)