In [1]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ---------------------------------------- 0.0/232.6 kB ? eta -:--:--
   --------------- ------------------------ 92.2/232.6 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 232.6/232.6 kB 2.9 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import PyPDF2
import re
from pathlib import Path
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from ttkthemes import ThemedTk

def read_pdf(file_path):
    """Read a PDF file and return its extracted text page by page.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        On success, a list of ``(page_number, page_text)`` tuples, with page
        numbers starting at 1. A page from which no text could be extracted
        carries the placeholder ``"[No text found on this page]"``.
        On failure, a ``str`` containing a human-readable error message
        (no exception is raised), so callers must check the return type.
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Extract inside the ``with`` block: pages are read from the open file.
            return [
                (page_num, page.extract_text() or "[No text found on this page]")
                for page_num, page in enumerate(reader.pages, start=1)
            ]
    except FileNotFoundError:
        return "Error: File not found. Please check the file path."
    except PyPDF2.errors.PdfReadError:
        # PdfReadError lives in PyPDF2.errors. The old PyPDF2.utils location is
        # gone in PyPDF2 3.x, and referencing it here would raise AttributeError
        # while handling the original error.
        return "Error: The PDF file is corrupted or unreadable."
    except Exception as e:
        return f"An error occurred: {e}"

def highlight_query(snippet, query):
    """Wrap every case-insensitive occurrence of ``query`` in ANSI bold-green codes.

    Args:
        snippet: Text in which to mark the matches.
        query: Literal text to look for (regex metacharacters are escaped).

    Returns:
        ``snippet`` with each match surrounded by ``ESC[1;32m`` ... ``ESC[0m``.
        The matched text keeps its original casing. An empty ``query`` returns
        ``snippet`` unchanged.
    """
    if not query:
        # An empty pattern matches at every position and would scatter escape
        # codes between all characters.
        return snippet
    pattern = re.compile(re.escape(query), re.IGNORECASE)
    return pattern.sub(lambda m: f"\033[1;32m{m.group(0)}\033[0m", snippet)

def search_query(content_by_page, query, context_before=100, context_after=400):
    """Find every case-insensitive occurrence of ``query`` in the page texts.

    Args:
        content_by_page: Iterable of ``(page_number, page_text)`` tuples.
            ``None`` or an empty collection yields no results.
        query: Literal text to search for (regex metacharacters are escaped).
            An empty query yields no results.
        context_before: Number of characters kept before the start of a match.
        context_after: Number of characters kept from the start of a match
            onwards (this span includes the match itself).

    Returns:
        A list of ``(page_number, highlighted_snippet)`` tuples, one per match,
        in page order. Snippets are passed through ``highlight_query``.
    """
    results = []
    if not content_by_page or not query:
        # Nothing to search, or an empty pattern that would match everywhere.
        return results

    # Compile once instead of re-building the pattern for every page.
    pattern = re.compile(re.escape(query), re.IGNORECASE)
    for page_num, page_text in content_by_page:
        for match in pattern.finditer(page_text):
            start_idx = match.start()
            snippet = page_text[max(0, start_idx - context_before):start_idx + context_after]
            results.append((page_num, highlight_query(snippet, query)))
    return results

def chat_with_pdf(content_by_page):
    """Build the "PDF Chatbot" window and run its Tk main loop.

    Args:
        content_by_page: List of ``(page_number, page_text)`` tuples to search,
            or ``None`` to start without a document. A document can be loaded
            (or replaced) at any time with the "Upload PDF" button.

    The call blocks until the window is closed.
    """
    # highlight_query() marks hits with ANSI colour codes, which a Tk Text
    # widget would display as stray characters. They are converted to a text tag.
    ansi_match = re.compile("\x1b\\[1;32m(.*?)\x1b\\[0m", re.DOTALL)

    # Colours for the two themes the toggle button switches between.
    palettes = {
        'equilux': {  # dark
            'button_fg': 'white', 'button_bg': '#2e2e2e',
            'surface_bg': '#1e1e1e', 'label_fg': 'white',
            'text_bg': '#2e2e2e', 'text_fg': 'white',
            'status_bg': '#1e1e1e', 'status_fg': 'white',
            'title_fg': '#00ff9d',  # Neon green for title in dark mode
            'match_fg': '#00ff9d',
        },
        'arc': {  # light
            'button_fg': 'black', 'button_bg': '#f0f0f0',
            'surface_bg': '#ffffff', 'label_fg': 'black',
            'text_bg': 'white', 'text_fg': 'black',
            'status_bg': '#f0f0f0', 'status_fg': 'black',
            'title_fg': '#2c5282',  # Deep blue for title in light mode
            'match_fg': '#2f855a',
        },
    }

    def insert_snippet(snippet):
        """Append ``snippet`` to the results box, tagging highlighted hits."""
        pos = 0
        for hit in ansi_match.finditer(snippet):
            results_text.insert(tk.END, snippet[pos:hit.start()])
            results_text.insert(tk.END, hit.group(1), "match")
            pos = hit.end()
        results_text.insert(tk.END, snippet[pos:])

    def handle_search():
        """Run the query from the entry box and show the matching snippets."""
        query = query_entry.get()
        if not query:
            messagebox.showinfo("PDF Bot", "Please enter a query.")
            return

        if not content_by_page:
            messagebox.showerror("Error", "No valid PDF content loaded.")
            return

        results = search_query(content_by_page, query)
        results_text.delete("1.0", tk.END)

        if results:
            for i, (page_num, snippet) in enumerate(results, 1):
                results_text.insert(tk.END, f"\nResult {i} (Page {page_num}):\n")
                insert_snippet(snippet)
                results_text.insert(tk.END, "...\n")
            status_bar.configure(text=f"{len(results)} result(s) found")
        else:
            results_text.insert(tk.END, "PDF Bot: Sorry, I couldn't find anything related to your query in the document.")
            status_bar.configure(text="No results found")

    def upload_pdf():
        """Ask for a PDF file and make it the document being searched."""
        nonlocal content_by_page
        file_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
        if not file_path:
            return

        loaded = read_pdf(file_path)
        # read_pdf() reports every failure as a string; not all of its
        # messages contain the word "Error", so test the type only.
        if isinstance(loaded, str):
            messagebox.showerror("Error", loaded)
            return

        # Swap the document in place instead of opening a second window with
        # its own nested main loop.
        content_by_page = loaded
        results_text.delete("1.0", tk.END)
        status_bar.configure(text=f"Loaded {len(loaded)} page(s) from {file_path}")

    def toggle_theme():
        """Switch between the light ('arc') and dark ('equilux') themes."""
        target = 'equilux' if style.theme_use() == 'arc' else 'arc'
        colors = palettes[target]
        root.set_theme(target)

        # Style options are stored per theme, so padding and fonts are
        # re-applied together with the colours after every switch.
        style.configure('Custom.TButton', padding=10, font=('Helvetica', 10, 'bold'),
                        foreground=colors['button_fg'], background=colors['button_bg'])
        style.configure('Custom.TFrame', padding=10, background=colors['surface_bg'])
        style.configure('Custom.TLabel', font=('Helvetica', 10),
                        foreground=colors['label_fg'], background=colors['surface_bg'])
        # ttk.Entry has no bg/fg options; its colours are set through the style.
        style.configure('Custom.TEntry', fieldbackground=colors['text_bg'],
                        foreground=colors['text_fg'])

        results_text.configure(bg=colors['text_bg'], fg=colors['text_fg'],
                               insertbackground=colors['text_fg'])
        results_text.tag_configure("match", foreground=colors['match_fg'])
        status_bar.configure(background=colors['status_bg'], foreground=colors['status_fg'])
        main_frame.configure(style='Custom.TFrame')
        title_label.configure(foreground=colors['title_fg'])

    root = ThemedTk(theme="arc")
    root.title("PDF Chatbot")
    root.geometry("1000x800")

    style = ttk.Style()
    style.configure('Custom.TButton', padding=10, font=('Helvetica', 10, 'bold'))
    style.configure('Custom.TFrame', padding=10)
    style.configure('Custom.TLabel', font=('Helvetica', 10))

    # Outer container for all widgets
    main_frame = ttk.Frame(root, style='Custom.TFrame')
    main_frame.pack(fill=tk.BOTH, expand=True, padx=20, pady=20)

    # Header: title on the left, theme toggle on the right
    header_frame = ttk.Frame(main_frame)
    header_frame.pack(fill=tk.X, pady=(0, 20))

    title_label = ttk.Label(header_frame, text="PDF Chatbot", font=('Helvetica', 28, 'bold'), foreground='#2c5282')
    title_label.pack(side=tk.LEFT)

    theme_button = ttk.Button(header_frame, text="🌓 Toggle Theme", style='Custom.TButton', command=toggle_theme)
    theme_button.pack(side=tk.RIGHT)

    # Upload row
    upload_frame = ttk.Frame(main_frame)
    upload_frame.pack(fill=tk.X, pady=(0, 20))

    upload_button = ttk.Button(upload_frame, text="📁 Upload PDF", style='Custom.TButton', command=upload_pdf)
    upload_button.pack(side=tk.LEFT, padx=(0, 10))

    # Search row: query entry plus search button
    search_frame = ttk.Frame(main_frame)
    search_frame.pack(fill=tk.X, pady=(0, 20))

    query_entry = ttk.Entry(search_frame, width=70, font=('Helvetica', 12), style='Custom.TEntry')
    query_entry.pack(side=tk.LEFT, padx=(0, 10))

    search_button = ttk.Button(search_frame, text="🔍 Search", style='Custom.TButton', command=handle_search)
    search_button.pack(side=tk.LEFT)

    # Results area: text widget with a vertical scrollbar beside it
    results_frame = ttk.Frame(main_frame)
    results_frame.pack(fill=tk.BOTH, expand=True)

    results_text = tk.Text(
        results_frame,
        wrap=tk.WORD,
        font=('Helvetica', 11),
        padx=15,
        pady=15,
        relief=tk.FLAT,
        borderwidth=0,
        selectbackground='#4a9eff',
        selectforeground='white'
    )
    results_text.tag_configure("match", foreground=palettes['arc']['match_fg'],
                               font=('Helvetica', 11, 'bold'))

    # Pack the scrollbar first so it keeps the right edge; packing it after the
    # expanding text widget would leave it only the space below the text.
    scrollbar = ttk.Scrollbar(results_frame, orient=tk.VERTICAL, command=results_text.yview)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    results_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    results_text.configure(yscrollcommand=scrollbar.set)

    # Status bar at the bottom of the window
    status_bar = ttk.Label(main_frame, text="Ready", relief=tk.FLAT, anchor=tk.W, style='Custom.TLabel')
    status_bar.pack(fill=tk.X, side=tk.BOTTOM, pady=(10, 0))

    root.mainloop()

# Entry point: open the GUI with no document loaded (None is passed as the
# initial content).
if __name__ == "__main__":
    chat_with_pdf(None)
