In [1]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ---------------------------------------- 0.0/232.6 kB ? eta -:--:--
   --------------- ------------------------ 92.2/232.6 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 232.6/232.6 kB 2.9 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import PyPDF2
import re
from pathlib import Path
import tkinter as tk
from tkinter import filedialog, messagebox

def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        On success, a list of ``(page_number, page_text)`` tuples with
        1-based page numbers. A page with no extractable text carries the
        placeholder ``"[No text found on this page]"``.
        On failure, a human-readable error message string is returned
        instead of a list (no exception is propagated to the caller).
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Text must be extracted while the file is still open.
            return [
                (page_num, page.extract_text() or "[No text found on this page]")
                for page_num, page in enumerate(reader.pages, start=1)
            ]
    except FileNotFoundError:
        return "Error: File not found. Please check the file path."
    except PyPDF2.errors.PdfReadError:
        # PdfReadError lives in PyPDF2.errors; the former PyPDF2.utils path
        # is gone in PyPDF2 3.x, so referencing it here raised AttributeError
        # instead of reporting the unreadable file.
        return "Error: The PDF file is corrupted or unreadable."
    except Exception as e:
        return f"An error occurred: {e}"

def get_pdf_metadata(file_path):
    """Return the document metadata of the PDF at ``file_path``.

    The file is opened in binary mode and handed to ``PyPDF2.PdfReader``;
    whatever its ``metadata`` attribute holds is returned unchanged.
    If anything goes wrong, a message string describing the failure is
    returned instead of raising.
    """
    try:
        with open(file_path, 'rb') as stream:
            return PyPDF2.PdfReader(stream).metadata
    except Exception as exc:
        return f"An error occurred while retrieving metadata: {exc}"

def highlight_query(snippet, query):
    """Wrap each case-insensitive occurrence of ``query`` in ANSI bold-green codes.

    The query is matched literally (regex metacharacters are escaped) and the
    matched text keeps its original casing in the returned string.
    """
    matcher = re.compile(f"({re.escape(query)})", flags=re.IGNORECASE)
    # The template's \033 is the ESC character; \1 re-inserts the matched text.
    return matcher.sub(r"\033[1;32m\1\033[0m", snippet)

def search_query(content_by_page, query, context_before=100, context_after=400):
    """Find every occurrence of ``query`` in the PDF text and return snippets.

    Args:
        content_by_page: Iterable of ``(page_number, page_text)`` tuples.
            ``None`` or an empty sequence yields no results.
        query: Text to look for; matched literally and case-insensitively.
            An empty query yields no results (an empty pattern would
            otherwise match at every character position).
        context_before: Number of characters kept before the match start.
        context_after: Number of characters kept from the match start onward.

    Returns:
        A list of ``(page_number, highlighted_snippet)`` tuples, one per
        occurrence, in page order. Snippets are passed through
        ``highlight_query``.
    """
    if not query or not content_by_page:
        return []

    # Compile once instead of re-building the pattern for every page.
    pattern = re.compile(re.escape(query), re.IGNORECASE)
    results = []
    for page_num, page_text in content_by_page:
        for match in pattern.finditer(page_text):
            start_idx = match.start()
            # max(0, ...) keeps a negative start from wrapping around to the
            # end of the page text.
            snippet = page_text[max(0, start_idx - context_before):start_idx + context_after]
            results.append((page_num, highlight_query(snippet, query)))
    return results

def chat_with_pdf(content_by_page):
    """Open the Tk search window and block until the user closes it.

    Args:
        content_by_page: List of ``(page_number, page_text)`` tuples as
            returned by ``read_pdf``, or ``None`` to start with no document
            loaded; a document can then be chosen with "Upload PDF".

    The window offers a PDF upload button, a query box, a results pane, a
    history of queries that produced results, and a metadata pane.
    Uploading a PDF replaces the document searched by this same window.
    """
    # highlight_query marks matches with ANSI colour codes. A Tk Text widget
    # does not interpret them, so they are translated into a text tag.
    ansi_sgr = re.compile(r"\x1b\[([0-9;]*)m")
    search_history = []

    def insert_snippet(snippet):
        """Append a snippet to the results pane, tagging highlighted spans."""
        def put(chunk, highlighted):
            if not chunk:
                return
            if highlighted:
                results_text.insert(tk.END, chunk, "match")
            else:
                results_text.insert(tk.END, chunk)

        highlighted = False
        cursor = 0
        for code in ansi_sgr.finditer(snippet):
            put(snippet[cursor:code.start()], highlighted)
            # "ESC[0m" (or "ESC[m") resets; any other code starts a highlight.
            highlighted = code.group(1) not in ("", "0")
            cursor = code.end()
        put(snippet[cursor:], highlighted)

    def handle_search():
        """Run the query from the entry box and show the matching snippets."""
        query = query_entry.get()
        if not query:
            messagebox.showinfo("PDF Bot", "Please enter a query.")
            return

        if not content_by_page:
            messagebox.showerror("Error", "No valid PDF content loaded.")
            return

        results = search_query(content_by_page, query)
        results_text.delete("1.0", tk.END)

        if results:
            # Only queries that found something are remembered.
            search_history.append(query)
            history_text.delete("1.0", tk.END)
            history_text.insert(tk.END, "Search History:\n" + "\n".join(search_history))
            for i, (page_num, snippet) in enumerate(results, 1):
                results_text.insert(tk.END, f"\nResult {i} (Page {page_num}):\n")
                insert_snippet(snippet)
                results_text.insert(tk.END, "...\n")
        else:
            results_text.insert(tk.END, "PDF Bot: Sorry, I couldn't find anything related to your query in the document.")

    def handle_clear():
        """Empty the results pane and reset the search history."""
        results_text.delete("1.0", tk.END)
        history_text.delete("1.0", tk.END)
        search_history.clear()
        history_text.insert(tk.END, "Search History:")

    def upload_pdf():
        """Let the user pick a PDF and make it the document being searched."""
        nonlocal content_by_page
        file_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
        if not file_path:
            return  # dialog was cancelled

        loaded = read_pdf(file_path)
        # read_pdf reports every failure by returning a message string; not
        # all of its messages contain the word "Error", so test the type.
        if isinstance(loaded, str):
            messagebox.showerror("Error", loaded)
            return

        # Swap the document in place instead of opening a second Tk root
        # with its own nested mainloop.
        content_by_page = loaded
        handle_clear()

        metadata = get_pdf_metadata(file_path)
        metadata_text.delete("1.0", tk.END)
        metadata_text.insert(tk.END, "PDF Metadata:\n")
        if isinstance(metadata, str):
            # get_pdf_metadata returns a message string on failure.
            metadata_text.insert(tk.END, metadata + "\n")
        elif metadata:
            for key, value in metadata.items():
                metadata_text.insert(tk.END, f"{key}: {value}\n")
        else:
            # The metadata may be None or empty.
            metadata_text.insert(tk.END, "No metadata available.\n")

    root = tk.Tk()
    root.title("PDF Chatbot")
    root.geometry("1000x800")

    # UI Elements
    tk.Label(root, text="Upload your PDF:", padx=10, pady=5).grid(row=0, column=0, columnspan=3)

    upload_button = tk.Button(root, text="Upload PDF", command=upload_pdf)
    upload_button.grid(row=1, column=0, pady=10)

    query_entry = tk.Entry(root, width=50)
    query_entry.grid(row=2, column=0, columnspan=3, padx=10, pady=10)

    search_button = tk.Button(root, text="Search", command=handle_search)
    search_button.grid(row=3, column=0, pady=10)

    clear_button = tk.Button(root, text="Clear All", command=handle_clear)
    clear_button.grid(row=3, column=1, pady=10)

    # Search History Display
    history_text = tk.Text(root, wrap=tk.WORD, width=30, height=10, padx=10, pady=10)
    history_text.grid(row=4, column=2, rowspan=2, padx=10, pady=10)
    history_text.insert(tk.END, "Search History:")

    # Results Display
    results_text = tk.Text(root, wrap=tk.WORD, width=90, height=20, padx=10, pady=10)
    results_text.grid(row=4, column=0, columnspan=2, padx=10, pady=10)
    # Visual equivalent of the ANSI green used by highlight_query.
    results_text.tag_configure("match", foreground="#008000")

    # Metadata Display
    metadata_text = tk.Text(root, wrap=tk.WORD, width=90, height=8, padx=10, pady=10)
    metadata_text.grid(row=5, column=0, columnspan=2, padx=10, pady=10)

    root.mainloop()

if __name__ == "__main__":
    # Start with no document loaded; one is chosen via the "Upload PDF" button.
    chat_with_pdf(content_by_page=None)
