In [2]:
# Install necessary libraries
!pip install PyPDF2 pdf2image pytesseract transformers sentence-transformers faiss-cpu torch

# Install poppler for pdf2image (required for OCR)
!apt-get install poppler-utils

# Install Tesseract with language support (Hindi, Chinese, etc.)
!apt-get install tesseract-ocr tesseract-ocr-hin tesseract-ocr-chi-sim tesseract-ocr-ben

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cp

In [3]:
# Import required libraries
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract
from transformers import AutoTokenizer, AutoModel, pipeline
import faiss
import numpy as np
import torch

# Step 1: Extract text from Digital PDFs
def extract_text_from_digital(pdf_path):
    """Extract the embedded text layer of a digital (non-scanned) PDF.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``PdfReader``.

    Returns:
        str: The text of every page, in page order, separated by newlines.
        A page whose ``extract_text()`` result is falsy (e.g. ``None`` or
        ``""``) contributes an empty string instead of raising.
    """
    reader = PdfReader(pdf_path)
    # `or ""` guards against a page that yields no text object at all, which
    # would otherwise make string concatenation fail with a TypeError.
    # Joining with "\n" keeps the last word of one page from fusing with the
    # first word of the next (the chunker splits on whitespace).
    return "\n".join((page.extract_text() or "") for page in reader.pages)

# Step 2: Extract text from Scanned PDFs using OCR
def extract_text_from_scanned(pdf_path, lang='eng+hin+chi_sim+ben'):
    """OCR every page of a scanned PDF and return the recognised text.

    Args:
        pdf_path: Location of the PDF; handed unchanged to
            ``convert_from_path``, which rasterises it into page images.
        lang: Tesseract language specification passed to
            ``pytesseract.image_to_string``. The default keeps the original
            English + Hindi + Simplified Chinese + Bengali combination.

    Returns:
        str: The OCR output of all pages concatenated in page order.
    """
    pages = convert_from_path(pdf_path)
    # Build the result in one join instead of repeated `+=` on a growing string.
    return "".join(pytesseract.image_to_string(page, lang=lang) for page in pages)

# Step 3: Wrapper function to handle both scanned and digital PDFs
def extract_text(pdf_path, scanned=False):
    """Return the text of a PDF using the extractor that fits its kind.

    Args:
        pdf_path: Location of the PDF, forwarded to the chosen extractor.
        scanned: Truthy to OCR the document via ``extract_text_from_scanned``;
            falsy (default) to read its text layer via
            ``extract_text_from_digital``.

    Returns:
        Whatever the selected extractor returns for ``pdf_path``.
    """
    extractor = extract_text_from_scanned if scanned else extract_text_from_digital
    return extractor(pdf_path)

# Step 4: Chunk the text into smaller pieces (handling token limits)
def chunk_text(text, chunk_size=500, overlap=50, max_tokens=512):
    """Split text into overlapping word windows that respect a token budget.

    The text is split on whitespace and walked in windows of ``chunk_size``
    words, advancing ``chunk_size - overlap`` words each time. Each window is
    tokenized with the module-level ``tokenizer``; a window that needs more
    than the budget is cut down to it.

    Args:
        text: The document text to split.
        chunk_size: Number of words per window; must be positive.
        overlap: Number of words shared by consecutive windows; must satisfy
            ``0 <= overlap < chunk_size`` so the window always moves forward.
        max_tokens: Upper bound on tokens per chunk, counted *including* the
            special tokens the tokenizer adds around a single sequence.

    Returns:
        list[str]: The chunks in document order (empty for empty text).

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would not advance the window,
            or ``max_tokens`` leaves no room for content tokens.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of words")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would make the range step 0; a larger overlap
        # makes it negative, which silently produced no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # Reserve room for the special tokens (e.g. [CLS]/[SEP]) that are added
    # when the chunk is encoded for the model later on.
    content_budget = max_tokens - tokenizer.num_special_tokens_to_add(pair=False)
    if content_budget <= 0:
        raise ValueError("max_tokens is too small to hold any content tokens")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        # Tokenize once, without special tokens, so the slice below only ever
        # cuts content tokens.
        token_ids = tokenizer(chunk, add_special_tokens=False)['input_ids']
        if len(token_ids) > content_budget:
            # NOTE: tokens past the budget are dropped here; they are not
            # carried over into the next chunk.
            chunk = tokenizer.decode(token_ids[:content_budget], skip_special_tokens=True)
        chunks.append(chunk)

    return chunks

# Step 5: Load the MiniLM tokenizer and encoder used for embedding generation.
# Both come from the same Hugging Face checkpoint, named once here.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

# Step 6: Function to generate embeddings for each chunk of text
def get_embeddings(text):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    The token vectors of the model's last hidden state are averaged per
    sequence, weighted by the attention mask so that padding positions
    (present when several texts of different length are encoded together)
    do not distort the mean.

    Args:
        text: The text to embed, passed straight to ``tokenizer`` (with
            padding and truncation enabled).

    Returns:
        numpy.ndarray: float32 array with one row per encoded sequence and one
        column per hidden dimension of the model.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Expand the mask to the hidden dimension so padded positions contribute zero.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # Clamp avoids a division by zero for a (degenerate) fully-masked row.
    counts = mask.sum(dim=1).clamp(min=1e-9)

    # FAISS expects float32 input.
    return (summed / counts).detach().cpu().numpy().astype('float32')

# Step 7: Initialize FAISS index for semantic search
# The vector size is read from the loaded encoder (384 for this MiniLM checkpoint)
# rather than hard-coded, so the index always matches what the model produces.
dimension = model.config.hidden_size
index = faiss.IndexFlatL2(dimension)

# Step 8: Index the embeddings of text chunks in FAISS
def index_embeddings(chunks):
    """Embed each chunk and append the vectors to the module-level FAISS ``index``.

    Vectors are added in the order of ``chunks``. The index is not cleared
    first, so rows accumulate across calls.

    Args:
        chunks: Sequence of text chunks; each one is embedded with
            ``get_embeddings``.

    Returns:
        numpy.ndarray: float32 matrix with one row per chunk. For an empty
        ``chunks`` an empty ``(0, index.d)`` matrix is returned and the index
        is left untouched.
    """
    if len(chunks) == 0:
        # np.vstack raises on an empty list; there is nothing to add anyway.
        return np.empty((0, index.d), dtype='float32')

    stacked = np.vstack([get_embeddings(chunk) for chunk in chunks])
    # FAISS only accepts float32; copy=False avoids a copy when already float32.
    embeddings = stacked.astype('float32', copy=False)
    index.add(embeddings)

    return embeddings

# Step 9: Search FAISS index for relevant chunks based on query
def search_faiss(query, top_k=5):
    """Find the stored vectors closest to a query in the module-level FAISS ``index``.

    Args:
        query: Query text, embedded with ``get_embeddings``.
        top_k: Maximum number of neighbours to return. It is capped at the
            number of vectors currently stored, so the result holds only real
            row ids and no ``-1`` placeholders for missing neighbours.

    Returns:
        tuple: ``(indices, distances)`` as returned by ``index.search``, each
        with one row for the query. Both have zero columns when the index is
        empty.
    """
    if index.ntotal == 0:
        # Nothing indexed yet: report "no results" instead of asking FAISS.
        return np.empty((1, 0), dtype='int64'), np.empty((1, 0), dtype='float32')

    # FAISS requires a 2-D float32 query matrix.
    query_embedding = get_embeddings(query).astype('float32', copy=False)

    neighbours = min(top_k, index.ntotal)
    distances, indices = index.search(query_embedding, neighbours)

    return indices, distances

# Step 10: Answer generation — a pre-trained T5-small text2text pipeline
# turns the retrieved context plus the question into an answer.
generator = pipeline(task="text2text-generation", model="t5-small")

def generate_answer(question, context):
    """Generate an answer to a question from retrieved context.

    Args:
        question: The user's question.
        context: Text the answer should be drawn from.

    Returns:
        str: The ``generated_text`` of the first result produced by the
        module-level ``generator`` pipeline.
    """
    input_text = f"question: {question} context: {context}"
    # The prompt can be much longer than the model accepts (several chunks are
    # concatenated upstream). truncation=True asks the pipeline to cut it to the
    # model's limit; the question sits at the front, so it is what survives.
    response = generator(input_text, max_length=200, truncation=True)
    return response[0]['generated_text']

# Step 11: Full pipeline for querying the system and generating an answer
def rag_pipeline(pdf_path, query, scanned=False, top_k=5):
    """Answer a question about one PDF: extract, chunk, index, retrieve, generate.

    Args:
        pdf_path: Location of the PDF to query.
        query: The question to answer.
        scanned: Truthy to OCR the PDF, falsy to read its text layer
            (forwarded to ``extract_text``).
        top_k: How many chunks to retrieve as context.

    Returns:
        str: The generated answer, or one of the explanatory messages below
        when no chunks or no usable search results are available.
    """
    # Step 1: Extract text from the PDF (scanned or digital)
    text = extract_text(pdf_path, scanned)

    # Step 2: Chunk the extracted text
    chunks = chunk_text(text)

    if not chunks:
        return "No chunks were generated from the document."

    # Step 3: Index the chunks in FAISS.
    # The index is module-level and shared between calls. Clear it first so
    # that row ids returned by the search line up with *this* call's `chunks`;
    # otherwise vectors left over from an earlier document would be searched
    # and their ids mapped onto the wrong text.
    index.reset()
    index_embeddings(chunks)

    # Step 4: Search the FAISS index for relevant chunks based on the query
    indices, _ = search_faiss(query, top_k)

    if len(indices[0]) == 0:
        return "No relevant results found in the FAISS index."

    # Step 5: Map row ids back to chunks. The lower bound matters: FAISS uses
    # -1 for "no neighbour", which Python indexing would otherwise silently
    # resolve to the last chunk.
    relevant_chunks = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

    if not relevant_chunks:
        return "No relevant chunks found."

    # Step 6: Concatenate relevant chunks into a context and generate an answer
    context = " ".join(relevant_chunks)
    return generate_answer(query, context)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [5]:
# Example usage: ask the same question through both extraction routes.
# `pdf_path` points at a file in Google Colab's file system; change it to a
# path on your own machine when running locally.
pdf_path = "/content/1721623520399.pdf"
query = "What is the main topic of the document?"

# OCR route — treats the file as a scanned PDF
scanned_pdf_answer = rag_pipeline(pdf_path=pdf_path, query=query, scanned=True)
print(f"Answer from scanned PDF: {scanned_pdf_answer}")

# Text-layer route — treats the file as a digital PDF
digital_pdf_answer = rag_pipeline(pdf_path=pdf_path, query=query, scanned=False)
print(f"Answer from digital PDF: {digital_pdf_answer}")

Answer from scanned PDF: e Analyze user data and provide insights for feature improvements How to Apply: If you are passionate about AI, AR, and mobile app development, we want to hear from you! Send your resume and portfolio (if applicable) to jobs.eveo.in with the position you are applying for in the subject line
Answer from digital PDF:  Analyze user data and provide insights for feature improvements How to Apply: If you are passionate about AI, AR, and mobile app development, we want to hear from you! Send your resume and portfolio (if applicable) to jobs.eveo.in with the position you are applying for in the subject line
