In [1]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ---------------------------------------- 0.0/232.6 kB ? eta -:--:--
   --------------- ------------------------ 92.2/232.6 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 232.6/232.6 kB 2.9 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import PyPDF2
import re
import pyttsx3
from pathlib import Path
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext
from tkinter import ttk

def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        A list of ``(page_number, text)`` tuples with 1-based page numbers.
        A page whose extracted text is empty carries the placeholder
        ``"[No text found on this page]"``. If any exception is raised while
        opening or parsing the file, the string ``"Error: <message>"`` is
        returned instead of a list.
    """
    try:
        with open(file_path, 'rb') as stream:
            document = PyPDF2.PdfReader(stream)
            # Build the list while the file is still open.
            return [
                (number, page.extract_text() or "[No text found on this page]")
                for number, page in enumerate(document.pages, start=1)
            ]
    except Exception as exc:
        return f"Error: {exc}"

def get_pdf_metadata(file_path):
    """Read the metadata object of a PDF file.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The value of ``PyPDF2.PdfReader(...).metadata`` for the file, or the
        string ``"Error: <message>"`` if opening or parsing raises.
    """
    try:
        with open(file_path, 'rb') as stream:
            return PyPDF2.PdfReader(stream).metadata
    except Exception as exc:
        return f"Error: {exc}"

def highlight_query(snippet, query):
    """Wrap each occurrence of ``query`` in ``snippet`` in ANSI colour codes.

    Matching is literal (the query is regex-escaped) and case-insensitive.
    Every hit is surrounded by ``ESC[1;32m`` ... ``ESC[0m``; the matched text
    keeps its original casing.

    Args:
        snippet: Text to decorate.
        query: Literal text to look for.

    Returns:
        ``snippet`` with the escape sequences inserted around each match.
    """
    matcher = re.compile(f"({re.escape(query)})", flags=re.IGNORECASE)
    return matcher.sub(r"\033[1;32m\1\033[0m", snippet)

def search_query(content_by_page, query, context_before=100, context_after=400):
    """Find every case-insensitive, literal occurrence of ``query`` in a PDF.

    Args:
        content_by_page: Iterable of ``(page_number, text)`` tuples. ``None``
            or an empty collection yields no results.
        query: Literal text to search for. An empty query yields no results;
            without this guard the empty pattern would match at every
            character position of every page.
        context_before: Number of characters kept before the start of a match.
        context_after: Number of characters kept from the start of a match
            onwards (the matched text itself counts towards this window).

    Returns:
        A list of ``(page_number, snippet)`` tuples, one per match, in page
        order. Each snippet has been passed through ``highlight_query``.
    """
    if not query or not content_by_page:
        return []

    # Compile once instead of re-parsing the pattern for every page.
    pattern = re.compile(re.escape(query), re.IGNORECASE)
    results = []
    for page_num, page_text in content_by_page:
        for match in pattern.finditer(page_text):
            start_idx = match.start()
            # max(0, ...) keeps a match near the page start from producing a
            # negative index, which would slice from the end of the text.
            snippet = page_text[max(0, start_idx - context_before):start_idx + context_after]
            results.append((page_num, highlight_query(snippet, query)))
    return results

def save_results(results):
    """Ask for a destination file and write the search results to it as text.

    Args:
        results: List of ``(page_number, snippet)`` tuples as returned by
            ``search_query``.

    Behaviour:
        * With no results, an info dialog is shown and nothing is written.
        * If the save dialog is cancelled, nothing is written.
        * ANSI colour escape sequences in the snippets are removed before
          writing, so the saved file contains plain text only.
        * A failure to write the file is reported in an error dialog.
    """
    if not results:
        messagebox.showinfo("PDF Bot", "No search results to save.")
        return

    file_path = filedialog.asksaveasfilename(
        defaultextension=".txt",
        filetypes=[("Text Files", "*.txt"), ("All Files", "*.*")],
    )
    if not file_path:
        return

    # Snippets are decorated with terminal colour codes (ESC[...m); those are
    # meaningless in a text file and show up as garbage in editors.
    ansi_codes = re.compile(r"\x1b\[[0-9;]*m")
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            for i, (page_num, snippet) in enumerate(results, 1):
                clean_snippet = ansi_codes.sub("", snippet)
                file.write(f"Result {i} (Page {page_num}):\n{clean_snippet}...\n\n")
    except OSError as exc:
        messagebox.showerror("Error", f"Could not save results: {exc}")
        return
    messagebox.showinfo("Success", "Search results saved successfully.")

def read_aloud(text):
    """Speak ``text`` using a newly initialised pyttsx3 engine.

    Args:
        text: The text to queue for speech.

    The function returns once the engine's ``runAndWait`` call has finished.
    """
    speaker = pyttsx3.init()
    speaker.say(text)
    speaker.runAndWait()

def chat_with_pdf(content_by_page):
    """Open the PDF search window and run its Tk event loop until it closes.

    Args:
        content_by_page: List of ``(page_number, text)`` tuples as produced by
            ``read_pdf``, or ``None`` to start with no document loaded. A PDF
            can be (re)loaded at any time with the "Upload PDF" button, which
            replaces the content searched by this same window.
    """
    search_history = []
    search_results = []
    # highlight_query wraps matches in ESC[1;32m ... ESC[0m. Splitting a
    # snippet on this pattern yields plain text at even indexes and the
    # matched text at odd indexes.
    highlight_span = re.compile(r"\x1b\[1;32m(.*?)\x1b\[0m", re.DOTALL)

    root = tk.Tk()
    root.title("Enhanced PDF Chatbot")
    root.geometry("1000x800")
    root.configure(bg="#f0f0f0")

    style = ttk.Style()
    style.configure("TButton", font=("Arial", 12), padding=5)
    style.configure("TLabel", font=("Arial", 12), background="#f0f0f0")

    def insert_snippet(snippet):
        """Append a snippet to the results box, colouring the matched spans."""
        for index, segment in enumerate(highlight_span.split(snippet)):
            if not segment:
                continue
            if index % 2:
                results_text.insert(tk.END, segment, "match")
            else:
                results_text.insert(tk.END, segment)

    def handle_search():
        """Run the query from the entry box and display the results."""
        nonlocal search_results
        query = query_entry.get()
        if not query:
            messagebox.showinfo("PDF Bot", "Please enter a query.")
            return
        if not content_by_page:
            messagebox.showerror("Error", "No valid PDF content loaded.")
            return
        search_results = search_query(content_by_page, query)
        results_text.delete(1.0, tk.END)
        if not search_results:
            results_text.insert(tk.END, "PDF Bot: Sorry, nothing found.")
            return
        # Only queries that produced results are recorded in the history.
        search_history.append(query)
        history_text.delete(1.0, tk.END)
        history_text.insert(tk.END, "Search History:\n" + "\n".join(search_history))
        for i, (page_num, snippet) in enumerate(search_results, 1):
            results_text.insert(tk.END, f"\nResult {i} (Page {page_num}):\n")
            insert_snippet(snippet)
            results_text.insert(tk.END, "...\n")

    def handle_clear():
        """Reset the results box, the history box and the stored results."""
        nonlocal search_results
        # Also drop the stored results so "Save Results" cannot write out
        # results that are no longer shown.
        search_results = []
        results_text.delete(1.0, tk.END)
        history_text.delete(1.0, tk.END)
        search_history.clear()
        history_text.insert(tk.END, "Search History:")

    def handle_read_aloud():
        """Speak the current contents of the results box."""
        text = results_text.get(1.0, tk.END).strip()
        if text:
            read_aloud(text)
        else:
            messagebox.showinfo("PDF Bot", "No search results to read aloud.")

    def upload_pdf():
        """Let the user pick a PDF and make it the document being searched."""
        nonlocal content_by_page, search_results
        file_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
        if not file_path:
            return
        pdf_content_by_page = read_pdf(file_path)
        # read_pdf reports a failure as an "Error: ..." string, not a list.
        if isinstance(pdf_content_by_page, str):
            messagebox.showerror("Error", pdf_content_by_page)
            return

        # Swap the document in place. Calling chat_with_pdf again here would
        # create a second Tk root with a nested mainloop while this window
        # kept searching the old (possibly empty) content.
        content_by_page = pdf_content_by_page
        search_results = []
        results_text.delete(1.0, tk.END)

        metadata = get_pdf_metadata(file_path)
        metadata_text.delete(1.0, tk.END)
        metadata_text.insert(tk.END, "PDF Metadata:\n")
        if isinstance(metadata, str):
            # get_pdf_metadata also signals failure with an "Error: ..." string.
            metadata_text.insert(tk.END, metadata + "\n")
        elif metadata:
            for key, value in metadata.items():
                metadata_text.insert(tk.END, f"{key}: {value}\n")
        else:
            # The reader's metadata can be None when the PDF carries none.
            metadata_text.insert(tk.END, "(no metadata available)\n")

    ttk.Label(root, text="Upload your PDF:").grid(row=0, column=0, columnspan=3, pady=10)
    ttk.Button(root, text="Upload PDF", command=upload_pdf).grid(row=1, column=0, pady=10)

    query_entry = ttk.Entry(root, width=50)
    query_entry.grid(row=2, column=0, columnspan=3, padx=10, pady=10)

    ttk.Button(root, text="Search", command=handle_search).grid(row=3, column=0, pady=10)
    ttk.Button(root, text="Clear", command=handle_clear).grid(row=3, column=1, pady=10)
    ttk.Button(root, text="Save Results", command=lambda: save_results(search_results)).grid(row=3, column=2, pady=10)
    ttk.Button(root, text="Read Aloud", command=handle_read_aloud).grid(row=4, column=1, pady=10)

    history_text = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=30, height=10)
    history_text.grid(row=5, column=2, rowspan=2, padx=10, pady=10)
    history_text.insert(tk.END, "Search History:")

    results_text = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=90, height=20)
    results_text.grid(row=5, column=0, columnspan=2, padx=10, pady=10)
    # Tag used by insert_snippet to render matches in place of the ANSI codes.
    results_text.tag_configure("match", foreground="green")

    metadata_text = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=90, height=8)
    metadata_text.grid(row=6, column=0, columnspan=2, padx=10, pady=10)

    # destroy() ends the mainloop and closes the window; quit() only stops the
    # loop, leaving a dead window behind while the notebook kernel stays alive.
    ttk.Button(root, text="Exit", command=root.destroy).grid(row=7, column=1, pady=10)

    root.mainloop()

# Start the GUI with no document loaded; a PDF is chosen via "Upload PDF".
if __name__ == "__main__":
    chat_with_pdf(content_by_page=None)
