In [1]:
%pip install PyPDF2

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import PyPDF2
import re
import tkinter as tk
from tkinter import ttk, filedialog, messagebox, scrolledtext, simpledialog
from google.cloud import translate_v2 as translate
import pyttsx3
import threading
from pathlib import Path
import os

class PDFBotApp:
    def __init__(self, root):
        """Build the application on top of the given Tk root window.

        Sets the window title and size, creates the (initially empty)
        document and search state, prepares ttk styling and the speech
        engine, then builds the widgets and the keyboard shortcuts.
        """
        self.root = root
        root.title("PDF ChatBot")
        root.geometry("1200x800")

        # Session state: nothing is loaded or searched yet
        self.current_file = None
        self.content_by_page = []
        self.search_history = []
        self.annotations = {}
        self.dark_mode = tk.BooleanVar(value=False)

        # ttk styling
        self.style = ttk.Style()
        self.configure_styles()

        # Text-to-speech engine
        self.engine = pyttsx3.init()
        self.configure_tts()

        # Widgets first, then the shortcuts that act on them
        self.create_widgets()
        self.setup_bindings()

    def configure_styles(self):
        """Configure ttk styles for light/dark modes"""
        self.style.theme_use('clam')
        self.style.configure('.', font=('Helvetica', 11))
        self.style.configure('TButton', padding=5)
        self.style.configure('Dark.TButton', background='#404040', foreground='white')
        self.style.map('Dark.TButton',
                      background=[('active', '#606060'), ('!disabled', '#404040')],
                      foreground=[('active', 'white'), ('!disabled', 'white')])

    def configure_tts(self):
        """Configure text-to-speech settings"""
        self.engine.setProperty('rate', 150)
        self.engine.setProperty('volume', 0.9)
        if voices := self.engine.getProperty('voices'):
            self.engine.setProperty('voice', voices[0].id)

    def create_widgets(self):
        """Create and arrange all GUI components.

        Layout, top to bottom: a control bar (upload / dark-mode toggle), a
        search bar (query entry, search and clear buttons) and a content area
        with the results pane on the left and a side panel (action buttons,
        metadata pane, history pane) on the right.
        """
        # Control Frame
        control_frame = ttk.Frame(self.root)
        control_frame.pack(pady=10, fill=tk.X)

        ttk.Button(control_frame, text="Upload PDF", command=self.upload_pdf).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Toggle Dark Mode", command=self.toggle_dark_mode).pack(side=tk.RIGHT, padx=5)

        # Search Frame
        search_frame = ttk.Frame(self.root)
        search_frame.pack(pady=10, fill=tk.X)

        self.query_entry = ttk.Entry(search_frame, width=60)
        self.query_entry.pack(side=tk.LEFT, padx=5)
        ttk.Button(search_frame, text="Search", command=self.handle_search).pack(side=tk.LEFT, padx=5)
        ttk.Button(search_frame, text="Clear", command=self.clear_results).pack(side=tk.LEFT, padx=5)

        # Main Content Frame
        content_frame = ttk.Frame(self.root)
        content_frame.pack(fill=tk.BOTH, expand=True)

        # Results Panel
        self.results_text = scrolledtext.ScrolledText(content_frame, wrap=tk.WORD, width=80)
        self.results_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5)
        # Explicit black foreground keeps highlighted text readable when the
        # pane's own foreground is switched to white in dark mode.
        self.results_text.tag_configure('highlight', background='yellow', foreground='black')
        self.results_text.tag_configure('annotation', foreground='green')
        # Tag applied to the "Page N:" headers written by display_results
        self.results_text.tag_configure('page_header', font=('Helvetica', 11, 'bold'))

        # Side Panel
        side_panel = ttk.Frame(content_frame, width=250)
        side_panel.pack(side=tk.RIGHT, fill=tk.Y, padx=5)

        ttk.Button(side_panel, text="Annotate Text", command=self.annotate_text).pack(pady=5)
        ttk.Button(side_panel, text="Save Annotations", command=self.save_annotations).pack(pady=5)
        ttk.Button(side_panel, text="Translate Text", command=self.handle_translation).pack(pady=5)
        ttk.Button(side_panel, text="Read Aloud", command=self.handle_read_aloud).pack(pady=5)

        # A text widget without an explicit width requests 80 columns, which
        # would stretch the side panel far beyond the 250px asked for above.
        side_text_width = 30

        # Metadata Panel
        self.metadata_text = scrolledtext.ScrolledText(
            side_panel, height=10, width=side_text_width, wrap=tk.WORD)
        self.metadata_text.pack(fill=tk.X, pady=5)

        # History Panel
        self.history_text = scrolledtext.ScrolledText(
            side_panel, height=10, width=side_text_width, wrap=tk.WORD)
        self.history_text.pack(fill=tk.X, pady=5)
        self.history_text.insert(tk.END, "Search History:\n")

    def setup_bindings(self):
        """Set up keyboard bindings"""
        self.root.bind('<Control-s>', lambda e: self.handle_search())
        self.root.bind('<Control-q>', lambda e: self.root.quit())

    def toggle_dark_mode(self):
        """Toggle between dark and light themes"""
        self.dark_mode.set(not self.dark_mode.get())
        bg = '#2e2e2e' if self.dark_mode.get() else '#f0f0f0'
        fg = 'white' if self.dark_mode.get() else 'black'
        
        self.style.configure('.', background=bg, foreground=fg)
        self.results_text.config(bg=bg, fg=fg, insertbackground=fg)
        self.metadata_text.config(bg=bg, fg=fg, insertbackground=fg)
        self.history_text.config(bg=bg, fg=fg, insertbackground=fg)

    def upload_pdf(self):
        """Ask the user for a PDF, load its text and refresh the panels.

        The application state (``current_file`` / ``content_by_page``) is
        replaced only after the file was read successfully, so a failed load
        keeps the previously loaded document usable.
        """
        file_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
        if not file_path:
            # Dialog cancelled
            return

        content = self.read_pdf(file_path)

        # read_pdf signals failure by returning an error string instead of the
        # list of (page_num, text) tuples.
        if isinstance(content, str):
            messagebox.showerror("Error", content)
            return

        self.current_file = file_path
        self.content_by_page = content

        self.show_metadata(file_path)
        self.clear_results()

    def read_pdf(self, file_path):
        """Read and extract text from PDF file"""
        try:
            content = []
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                for page_num, page in enumerate(reader.pages, 1):
                    text = page.extract_text() or f"[No text found on page {page_num}]"
                    content.append((page_num, text))
            return content
        except Exception as e:
            return f"Error reading PDF: {str(e)}"

    def show_metadata(self, file_path):
        """Display PDF metadata in the dedicated side panel.

        Re-opens ``file_path`` with PyPDF2 and lists the title, author,
        creator, producer and subject fields.  A field the document does not
        provide is shown as ``N/A``.  Any failure is reported in an error
        dialog.
        """
        keys = ('title', 'author', 'creator', 'producer', 'subject')
        try:
            with open(file_path, 'rb') as f:
                metadata = PyPDF2.PdfReader(f).metadata
                # Read the values while the file is still open, in case the
                # reader resolves them lazily.  An unset field (or a document
                # without any metadata) yields None rather than a missing
                # attribute, so fall back to 'N/A' on any empty value.
                fields = [(key, getattr(metadata, key, None) or 'N/A') for key in keys]

            self.metadata_text.delete(1.0, tk.END)
            self.metadata_text.insert(tk.END, "PDF Metadata:\n")
            for key, value in fields:
                self.metadata_text.insert(tk.END, f"{key.capitalize()}: {value}\n")
        except Exception as e:
            messagebox.showerror("Metadata Error", str(e))

    def handle_search(self):
        """Perform search operation with highlighting"""
        query = self.query_entry.get().strip()
        if not query:
            messagebox.showwarning("Search Error", "Please enter a search query")
            return
        
        self.search_history.append(query)
        self.update_history()
        
        results = self.search_content(query)
        self.display_results(results)

    def search_content(self, query):
        """Search through PDF content with proper highlighting"""
        results = []
        flags = re.IGNORECASE
        pattern = re.compile(re.escape(query), flags)
        
        for page_num, text in self.content_by_page:
            matches = []
            for match in pattern.finditer(text):
                start, end = match.start(), match.end()
                snippet_start = max(0, start - 50)
                snippet_end = min(len(text), end + 50)
                snippet = text[snippet_start:snippet_end]
                matches.append((start - snippet_start, end - snippet_start))
                
                results.append({
                    'page': page_num,
                    'snippet': snippet,
                    'matches': matches.copy()
                })
        return results

    def display_results(self, results):
        """Show search results in the results pane and highlight the hits.

        Args:
            results: List of dicts with the keys ``'page'``, ``'snippet'``
                and ``'matches'``; each entry of ``'matches'`` is a
                ``(start, end)`` character offset pair inside the snippet.
        """
        self.results_text.delete(1.0, tk.END)
        if not results:
            self.results_text.insert(tk.END, "No results found")
            return

        for result in results:
            self.results_text.insert(tk.END, f"\nPage {result['page']}:\n", 'page_header')

            # The "end" index lies *after* the newline a Tk text widget always
            # keeps as its last character, while text inserted at "end" lands
            # just before that newline.  "end-1c" is therefore where the
            # snippet will actually start.
            start_idx = self.results_text.index(f"{tk.END}-1c")
            self.results_text.insert(tk.END, result['snippet'])

            for start, end in result['matches']:
                start_pos = f"{start_idx}+{start}c"
                end_pos = f"{start_idx}+{end}c"
                self.results_text.tag_add('highlight', start_pos, end_pos)

    def update_history(self):
        """Redraw the history pane with the ten latest queries, newest first."""
        newest_first = self.search_history[-10:]
        newest_first.reverse()  # the slice is a copy; the stored history keeps its order

        pane = self.history_text
        pane.delete(1.0, tk.END)
        pane.insert(tk.END, "Search History:\n")
        for position, past_query in enumerate(newest_first, 1):
            pane.insert(tk.END, f"{position}. {past_query}\n")

    def clear_results(self):
        """Empty the results and history panes and drop history and annotations."""
        for pane in (self.results_text, self.history_text):
            pane.delete(1.0, tk.END)
        # The history pane always keeps its heading
        self.history_text.insert(tk.END, "Search History:\n")

        self.search_history.clear()
        self.annotations.clear()
    def annotate_text(self):
        """Attach a user-supplied note to the text selected in the results pane.

        Asks for the page number and the annotation text; cancelling either
        dialog aborts without storing anything.  The annotated range is tagged
        with ``'annotation'`` in the results pane.
        """
        # Only the selection lookup is guarded: Tk raises TclError when the
        # widget has no selection, and errors from the later dialogs must not
        # be reported as a missing selection.
        try:
            sel_start = self.results_text.index(tk.SEL_FIRST)
            sel_end = self.results_text.index(tk.SEL_LAST)
        except tk.TclError:
            messagebox.showwarning("Annotation Error", "Please select text to annotate")
            return

        selected_text = self.results_text.get(sel_start, sel_end).strip()
        if not selected_text:
            # Selection consisted of whitespace only
            messagebox.showwarning("Annotation Error", "Please select text to annotate")
            return

        page_num = simpledialog.askinteger("Page Number", "Enter page number:", parent=self.root)
        if not page_num:
            # Cancelled (None) or 0, which is not a valid page
            return

        annotation = simpledialog.askstring("Annotation", "Enter your annotation:", parent=self.root)
        if annotation:
            self.add_annotation(page_num, selected_text, annotation)
            self.results_text.tag_add('annotation', sel_start, sel_end)

    def add_annotation(self, page_num, text, annotation):
        """Record ``(text, annotation)`` under ``page_num`` and confirm to the user."""
        # setdefault creates the page's list the first time the page is annotated
        page_notes = self.annotations.setdefault(page_num, [])
        page_notes.append((text, annotation))
        messagebox.showinfo("Success", "Annotation added successfully")

    def save_annotations(self):
        """Write all annotations to a text file chosen by the user.

        Warns when there is nothing to save; does nothing when the file
        dialog is cancelled.  Failures are shown in an error dialog.
        """
        if not self.annotations:
            messagebox.showwarning("Save Error", "No annotations to save")
            return

        target = filedialog.asksaveasfilename(
            defaultextension=".txt",
            filetypes=[("Text Files", "*.txt"), ("All Files", "*.*")]
        )
        if not target:
            return

        try:
            with open(target, 'w', encoding='utf-8') as out:
                for page, notes in self.annotations.items():
                    # One heading per page, followed by that page's notes
                    entries = "".join(
                        f"Text: {text}\nAnnotation: {annotation}\n\n"
                        for text, annotation in notes
                    )
                    out.write(f"=== Page {page} Annotations ===\n" + entries)
            messagebox.showinfo("Success", "Annotations saved successfully")
        except Exception as exc:
            messagebox.showerror("Save Error", str(exc))

    def handle_translation(self):
        """Translate the text selected in the results pane.

        Asks for a target language code, calls ``translate_text`` and appends
        the translation to the results pane.  Nothing is appended when the
        dialog is cancelled or the translation came back empty.
        """
        # Tk raises TclError when the widget has no selection; show a readable
        # hint instead of the raw Tcl error text.
        try:
            sel_start = self.results_text.index(tk.SEL_FIRST)
            sel_end = self.results_text.index(tk.SEL_LAST)
        except tk.TclError:
            messagebox.showwarning("Translation Error", "Please select text to translate")
            return

        text = self.results_text.get(sel_start, sel_end).strip()
        if not text:
            messagebox.showwarning("Translation Error", "No text selected")
            return

        target_lang = simpledialog.askstring(
            "Translation", "Enter target language code (e.g., fr):", parent=self.root)
        target_lang = (target_lang or "").strip()
        if not target_lang:
            # Cancelled or blank input
            return

        try:
            translated = self.translate_text(text, target_lang)
        except Exception as e:
            messagebox.showerror("Translation Error", str(e))
            return

        if not translated:
            # translate_text returns "" after reporting a failure itself, so
            # do not append an empty translation block.
            return

        self.results_text.insert(tk.END, f"\n\n[Translation ({target_lang.upper()})]:\n{translated}\n")

    def translate_text(self, text, target_lang):
        """Translate ``text`` into ``target_lang`` using Google Cloud Translation.

        Args:
            text: Plain text to translate.
            target_lang: Target language code (e.g. ``"fr"``).

        Returns:
            The translated text, or ``""`` when the request failed (the
            failure is reported in an error dialog).
        """
        try:
            client = translate.Client()
            # Request plain-text handling: with the service's default HTML
            # format the result comes back with HTML entities (e.g. "&#39;")
            # in place of characters such as apostrophes.
            result = client.translate(text, target_language=target_lang, format_='text')
            return result['translatedText']
        except Exception as e:
            messagebox.showerror("Translation Error",
                               "Ensure Google Cloud credentials are set.\nError: " + str(e))
            return ""

    def handle_read_aloud(self):
        """Read the whole content of the results pane aloud.

        Speech runs in a daemon thread so the GUI stays responsive.  Warns
        when the results pane is empty.
        """
        text = self.results_text.get(1.0, tk.END).strip()
        if not text:
            messagebox.showwarning("Read Error", "No text to read")
            return

        def report_error(message):
            messagebox.showerror("Speech Error", message)

        def speak():
            try:
                self.engine.stop()
                self.engine.say(text)
                self.engine.runAndWait()
            except Exception as e:
                # Dialogs belong to the GUI thread: hand the message to the Tk
                # event loop instead of opening the messagebox from this
                # worker thread.
                self.root.after(0, report_error, str(e))

        threading.Thread(target=speak, daemon=True).start()

if __name__ == "__main__":
    root = tk.Tk()
    app = PDFBotApp(root)

    # Check for Google Cloud credentials.  The warning is shown only once the
    # application window exists and is parented to it: without a root,
    # tkinter has to create an implicit one just for the dialog (older Python
    # versions leave it open as a stray blank window).
    if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
        messagebox.showwarning("Configuration Warning",
                             "Google Cloud credentials not found.\n"
                             "Translation features will be disabled.",
                             parent=root)

    root.mainloop()