In [4]:
!pip install PyPDF2 pdfplumber sentence-transformers faiss-cpu transformers
from google.colab import files
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
import warnings
import logging

# Keep the notebook output quiet: only errors from the transformers logger,
# and no Python warnings at all.
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Step 1: Upload the PDF through google.colab's files.upload()
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file uploaded. Please upload a PDF file.")

# Only the first uploaded entry is used; any further entries are ignored.
pdf_file_name = next(iter(uploaded))

# Step 2: Extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Pages for which ``extract_text()`` yields nothing are skipped; the text of
    each remaining page is followed by a newline.

    Args:
        file_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If opening or reading the PDF fails (the original error is
            chained as ``__cause__``), or if no text could be extracted.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            extracted = (page.extract_text() for page in pdf.pages)
            # Keep only pages that actually produced text (not None / "").
            page_texts = [page_text for page_text in extracted if page_text]
    except Exception as e:
        raise ValueError(f"An error occurred while extracting text: {str(e)}") from e

    text = "".join(f"{page_text}\n" for page_text in page_texts)
    if not text.strip():
        # Checked outside the try block so this message is not re-wrapped by
        # the generic "An error occurred..." handler above.
        raise ValueError("The PDF appears to contain no extractable text.")
    return text

# Extract the uploaded PDF's text once; pdf_text is the input to the chunking step.
pdf_text = extract_text_from_pdf(pdf_file_name)

# Step 3: Chunk the text
def _close_chunk(sentences):
    """Join *sentences* with ". " and ensure the result ends with punctuation."""
    joined = ". ".join(sentences)
    if joined.endswith((".", "!", "?")):
        return joined
    return joined + "."


def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* words, on sentence bounds.

    Sentences are found by splitting on ". ". Each sentence is stripped of
    surrounding whitespace and empty fragments are dropped. Sentences are
    packed greedily; a single sentence longer than *chunk_size* is kept whole
    as its own chunk rather than being cut.

    Args:
        text: The text to split.
        chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of chunk strings, each ending with terminal punctuation.
        Empty or whitespace-only *text* yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for raw_sentence in text.split(". "):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        sentence_length = len(sentence.split())
        # Flush only when there is something to flush; otherwise an oversized
        # first sentence would emit a bogus "." chunk.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(_close_chunk(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(_close_chunk(current_chunk))

    return chunks

# Split the extracted PDF text into chunks using chunk_text's default chunk_size.
chunks = chunk_text(pdf_text)

# Step 4: Generate embeddings for chunks
# NOTE(review): presumably encode() returns one row per chunk, so row i
# corresponds to chunks[i] — search_query relies on that mapping.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(chunks)

# Step 5: Create and populate FAISS index
# The index is sized from the second axis of `embeddings` (the vector width).
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

# Step 6: Search for relevant chunks
def search_query(query, top_k=3):
    """Return the stored text chunks whose embeddings are closest to *query*.

    Args:
        query: The user's question as a string.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to *top_k* entries of the module-level ``chunks`` list, in the
        order returned by ``index.search``. Fewer entries are returned when
        the index holds fewer than *top_k* vectors; an empty list when
        *top_k* is not positive or the index is empty.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []
    query_embedding = model.encode([query])
    _, indices = index.search(np.array(query_embedding), k)
    # Defensive filter in case the index still reports a -1 placeholder.
    return [chunks[i] for i in indices[0] if i >= 0]

# Step 7: Generate a response using Hugging Face
# The text2text-generation pipeline is built once at module level so that
# every generate_response() call reuses the same `generator` object.
generator = pipeline("text2text-generation", model="google/flan-t5-large")

def generate_response(query, relevant_chunks, *, max_context_words=300, max_length=150):
    """Answer *query* with the module-level ``generator``, using retrieved context.

    The chunks are joined with newlines to form the context. If the context
    exceeds *max_context_words* whitespace-separated words, it is cut to the
    first *max_context_words* words, re-joined with single spaces.

    Args:
        query: The user's question.
        relevant_chunks: Iterable of text chunks to use as context.
        max_context_words: Word budget for the context; should be
            non-negative. Counted in whitespace-separated words, not tokens.
        max_length: Forwarded to the generator as ``max_length``.

    Returns:
        The ``generated_text`` of the first result from the generator.
    """
    context = "\n".join(relevant_chunks)
    words = context.split()
    if len(words) > max_context_words:
        context = " ".join(words[:max_context_words])

    prompt = (
        "Use the context below to answer the query:\n\n"
        f"Context: {context}\n\n"
        f"Query: {query}\n\n"
        "Answer:"
    )
    # NOTE(review): with truncation=True an over-long prompt is presumably cut
    # at its end, which is where the query sits — confirm against the
    # pipeline's behaviour for long contexts.
    response = generator(prompt, max_length=max_length, do_sample=False, truncation=True)
    return response[0]['generated_text']

# Interactive Query Input: answer questions until the user types 'exit'
while True:
    try:
        query = input("\nQuery: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the session without a traceback.
        break
    if query.lower() == 'exit':
        break
    if not query:
        # Nothing to search for; prompt again instead of querying the model.
        continue
    relevant_chunks = search_query(query)
    response = generate_response(query, relevant_chunks)
    print(f"Response: {response}")




Saving Tables-Charts-and-Graphs_to_RAG_pipeline.pdf to Tables-Charts-and-Graphs_to_RAG_pipeline.pdf

Query: From page 2 get the exact unemployment information based on type of degree input
Response: Education

Query: From page 6 get the tabular data 
Response: Yearly U.S. GDP by Industry (in millions of dollars)

Query: From page 6 get the data
Response: U.S. Bureau of Labor Statistics

Query: What is the family budget allocation for transportation
Response: 15%

Query: Compare the unemployment rates of individuals with a high school diploma and a bachelor's degree
Response: Individuals with a high school diploma have a higher unemployment rate than individuals with a high school diploma.

Query: what is the title of the pdf
Response: Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life

Query: what is the important point in the pdf
Response: We use charts and graphs to visualize data.  This data can either be generate