In [None]:
# Install Required Libraries
!pip install PyPDF2 sentence-transformers faiss-cpu transformers torch

import os
import PyPDF2
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# Step 1: Preprocess the PDF File
def preprocess_pdf(file_path, chunk_size=200):
    """
    Extract text from a PDF file and split it into fixed-size character chunks.

    Args:
        file_path: Path of the PDF file to read.
        chunk_size: Maximum number of characters per chunk (default 200).

    Returns:
        A list of strings. Every chunk holds ``chunk_size`` characters except
        possibly the last one; the list is empty when no text was extracted.

    Raises:
        ValueError: If ``chunk_size`` is not greater than zero.
    """
    # Fail early: a zero step makes range() raise an unrelated error and a
    # negative step would silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    print("Preprocessing PDF...")
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Collect per-page text and join once instead of repeated `+=`.
        # `or ""` guards against a page that yields no text.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    text = "".join(page_texts)

    # Split the text into consecutive, non-overlapping character windows
    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
    print(f"Extracted {len(chunks)} chunks from the PDF.")
    return chunks

# Step 2: Generate Embeddings for Chunks
def generate_embeddings(chunks):
    """
    Encode every text chunk into a vector with a pre-trained
    Sentence Transformer model and return the encoder's output.
    """
    print("Generating embeddings...")
    # 'all-MiniLM-L6-v2' is a small, efficient embedding model
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder.encode(chunks, show_progress_bar=True)

# Step 3: Index Embeddings with FAISS
def index_embeddings(embeddings):
    """
    Create a FAISS index for efficient similarity search.

    Args:
        embeddings: A 2-D array-like of shape (n_vectors, dimension).

    Returns:
        A ``faiss.IndexFlatL2`` containing all the given vectors.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example
            when it was produced from an empty list of chunks).
    """
    print("Indexing embeddings...")
    # FAISS works on C-contiguous float32 matrices; coerce up front so
    # float64 arrays or nested lists are accepted too (no copy is made when
    # the input already has the right dtype and layout).
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension), "
            f"got shape {vectors.shape}"
        )

    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)  # Use L2 distance for similarity
    index.add(vectors)
    return index

# Step 4: Load Pretrained Language Model
def load_language_model():
    """
    Load the pretrained "google/flan-t5-large" tokenizer and seq2seq model.

    Returns a ``(tokenizer, model)`` tuple.
    """
    print("Loading language model...")
    checkpoint = "google/flan-t5-large"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )

# Step 5: Answer Questions Using RAG Pipeline
def answer_question(question, chunks, embeddings_index, embeddings_model, tokenizer, llm_model, top_k=3):
    """
    Answer a question with retrieval-augmented generation.

    The question is embedded, the nearest chunks are looked up in the index,
    and the language model generates an answer from a prompt made of the
    retrieved context followed by the question.

    Args:
        question: The question text.
        chunks: List of text chunks, positionally aligned with the vectors
            stored in ``embeddings_index``.
        embeddings_index: Index exposing ``search(query_vectors, k)``.
        embeddings_model: Encoder exposing ``encode(list_of_texts)``.
        tokenizer: Tokenizer used to build model inputs and decode outputs.
        llm_model: Model exposing ``generate(**inputs, max_length=...)``.
        top_k: Maximum number of chunks to retrieve (default 3).

    Returns:
        The decoded answer string.
    """
    print(f"Answering question: {question}")

    # Step 5.1: Encode the question
    question_embedding = embeddings_model.encode([question])

    # Step 5.2: Retrieve top-k relevant chunks
    # Never request more neighbours than there are chunks.
    k = min(top_k, len(chunks))
    retrieved_chunks = []
    if k > 0:
        _, indices = embeddings_index.search(question_embedding, k)
        # FAISS pads missing neighbours with -1; without this filter
        # chunks[-1] would silently inject the last chunk as context.
        retrieved_chunks = [chunks[i] for i in indices[0] if i >= 0]
    context = " ".join(retrieved_chunks)

    # Step 5.3: Generate an answer using the language model
    input_text = f"Context: {context}\nQuestion: {question}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = llm_model.generate(**inputs, max_length=150)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer

# Main Function
def main(pdf_file_path):
    """
    Run the RAG pipeline on one PDF and answer questions interactively.

    Builds the chunk list, embeddings and FAISS index from the PDF, loads the
    language model, then prompts for questions until the user types 'exit'.

    Args:
        pdf_file_path: Path of the PDF file to index.
    """
    # Step 1: Preprocess the PDF
    chunks = preprocess_pdf(pdf_file_path)

    # Step 2: Generate embeddings
    embeddings = generate_embeddings(chunks)

    # Step 3: Index embeddings
    embeddings_index = index_embeddings(embeddings)

    # Step 4: Load the language model
    tokenizer, llm_model = load_language_model()

    # Load the question encoder once, outside the loop, instead of
    # re-instantiating it for every question. It is the same model name that
    # generate_embeddings uses for the chunks.
    embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Step 5: Answer questions
    while True:
        # Strip so that inputs such as "exit " or " exit" still quit
        question = input("Enter your question (type 'exit' to quit): ").strip()
        if question.lower() == 'exit':
            break
        if not question:
            # Nothing to answer; prompt again rather than run the model
            continue

        answer = answer_question(
            question=question,
            chunks=chunks,
            embeddings_index=embeddings_index,
            embeddings_model=embeddings_model,
            tokenizer=tokenizer,
            llm_model=llm_model
        )
        print(f"Answer: {answer}\n")

# Run the Pipeline
if __name__ == "__main__":
    # Upload a PDF file in Google Colab
    from google.colab import files
    uploaded = files.upload()

    if not uploaded:
        # The upload dialog was cancelled or returned no files; indexing [0]
        # on the empty key list would raise IndexError.
        print("No file was uploaded; nothing to process.")
    else:
        # Use the first uploaded file name as the PDF path
        pdf_file_path = next(iter(uploaded))

        # Run the main function
        main(pdf_file_path)