In [11]:
import PyPDF2
import sys

def extract_text_from_pdf(pdf_path):
    """
    Print the text of every page of a PDF to standard output.

    A page-count banner is printed first, then each page's extracted text
    under a "--- Page N ---" heading, with an 80-character rule between
    pages. Failures are reported as a printed message instead of being
    raised, so the function always returns None.

    Args:
        pdf_path (str): Path to the PDF file
    """
    divider = "=" * 80

    try:
        # Binary mode: the reader parses raw bytes from the open handle.
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)

            print(f"Total pages in PDF: {len(reader.pages)}\n")
            print(divider)

            for number, page in enumerate(reader.pages, start=1):
                # Extract before printing the heading so a failing page
                # does not leave a dangling heading in the output.
                content = page.extract_text()

                print(f"\n--- Page {number} ---\n")
                print(content)
                print("\n" + divider)

    except FileNotFoundError:
        print(f"Error: File '{pdf_path}' not found.")
    except PyPDF2.errors.PdfReadError:
        print(f"Error: '{pdf_path}' is not a valid PDF file or is corrupted.")
    except Exception as exc:
        # Last-resort handler: report anything else rather than crash.
        print(f"An error occurred: {exc!s}")

if __name__ == "__main__":
    # Demo run against a fixed file name; a relative path is opened relative to the current working directory.
    pdf_path = "report.pdf"
    extract_text_from_pdf(pdf_path)

Total pages in PDF: 71


--- Page 1 ---

FP25-119-D-Rehnuma
Project Team
Faseeh Iqbal 22I-1856
Ahmad Hasan 22I-1945
Manhab Zafar 22I-1957
Session 2022-2026
Supervised by
Ms. Amna Irum
Co-Supervised by
Dr. Qurut-ul-Ain
Department of Data Science And Artificial Intelligence
National University of Computer and Emerging Sciences
Islamabad, Pakistan
June, 2026


--- Page 2 ---

Contents
1 Introduction 11
1.1 Existing Solutions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11
1.2 Problem Statement . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 22
1.3 Scope . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 33
1.4 Modules . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 33
1.4.1 Module 1: Dataset Pipeline . . . . . . . . . . . . . . . . . . . . 33
1.4.2 Module 2: Animation Pipeline . . . . . . . . . . . . . . . . . . . 44
1.4.3 Module 3: Quiz System . . . . . . . . . . . . . . . . . . . . . . . 44
1.4.4 Module 4: Real-Time QA 

In [None]:
import pdfplumber
import re
from typing import Dict, List, Optional
import sys

class PDFTextExtractor:
    """
    Extract text and tables from a PDF, page by page, using pdfplumber.

    Typical use: construct with a path, call ``extract_all()`` once, then
    either ``print_all()`` or ``get_full_text()``. Extraction is best-effort:
    problems are reported with a printed warning rather than raised.

    Attributes are documented inline in ``__init__``.
    """

    # Cells longer than this are truncated when a table is rendered.
    _MAX_CELL_WIDTH = 50

    # Heuristic patterns for header/footer lines. They are matched against
    # the lower-cased, stripped line, so the patterns themselves are lower-case.
    _HEADER_FOOTER_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'^\d+$',  # Just page numbers
            r'page\s*\d+',  # "Page 1", "page 2"
            r'\d+\s*of\s*\d+',  # "1 of 10"
            r'©.*\d{4}',  # Copyright notices
            r'^chapter\s+\d+',  # Chapter headings
        )
    )

    # Typographic ligatures that PDF text extraction commonly yields as a
    # single code point; expanded so the text is searchable as plain letters.
    _LIGATURES = str.maketrans({
        '\ufb00': 'ff',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
        '\ufb03': 'ffi',
        '\ufb04': 'ffl',
    })

    _HORIZONTAL_WS = re.compile(r'[^\S\n]+')
    _SPACE_AROUND_NEWLINE = re.compile(r' ?\n ?')
    _EXTRA_BLANK_LINES = re.compile(r'\n{3,}')

    def __init__(self, pdf_path: str):
        # Path handed to pdfplumber.open() by extract_all().
        self.pdf_path = pdf_path
        # One dict per page, filled by extract_all(); see extract_page_content()
        # for the keys.
        self.pages_data = []

    def is_likely_header_footer(self, text: str, threshold: int = 100) -> bool:
        """
        Detect if text is likely a header or footer based on length and patterns.

        Args:
            text (str): Text to check
            threshold (int): Lines whose stripped length exceeds this many
                characters are never treated as a header/footer

        Returns:
            bool: True if the line is short enough and matches one of the
            known header/footer patterns (page numbers, "page N", "N of M",
            copyright notices, "chapter N")
        """
        if not text or len(text.strip()) > threshold:
            return False

        text_lower = text.lower().strip()
        return any(
            pattern.search(text_lower)
            for pattern in self._HEADER_FOOTER_PATTERNS
        )

    def extract_tables(self, page) -> List[List[List[str]]]:
        """
        Extract tables from a page.

        Empty tables and empty rows are skipped; ``None``/empty cells become
        ``''`` and every other cell is stripped of surrounding whitespace.
        Any exception raised during extraction is reported as a printed
        warning and the tables collected so far are returned.

        Args:
            page: pdfplumber page object

        Returns:
            List of tables (each table is a list of rows)
        """
        tables = []
        try:
            for table in page.extract_tables() or []:
                if not table:
                    continue
                tables.append([
                    [cell.strip() if cell else '' for cell in row]
                    for row in table if row
                ])
        except Exception as e:
            # Best-effort: a table failure must not lose the page's text.
            print(f"Warning: Error extracting tables: {str(e)}")

        return tables

    @staticmethod
    def _fit_cell(cell, limit: int) -> str:
        """Return *cell* as single-line text no longer than *limit* characters."""
        if cell is None:
            return ''
        # Collapse internal newlines/tabs so one cell occupies one grid line.
        flat = ' '.join(str(cell).split())
        if len(flat) > limit:
            flat = flat[:max(limit - 3, 0)] + '...'
        return flat

    def format_table(self, table: List[List[str]]) -> str:
        """
        Format a table as a readable string.

        The table is drawn as an ASCII grid wrapped in ``[TABLE]`` /
        ``[/TABLE]`` markers, with an ``=`` rule after the first (header)
        row. The column count is taken from the longest row and shorter rows
        are padded with empty cells, so no cell is dropped. Cells are
        flattened to one line and truncated to 50 characters (ending in
        ``...``) so that every grid line has the same width.

        Args:
            table: List of rows (each row is a list of cells)

        Returns:
            Formatted table string, or ``""`` for an empty table
        """
        if not table:
            return ""

        num_cols = max(len(row or []) for row in table)
        rows = []
        for row in table:
            cells = [self._fit_cell(cell, self._MAX_CELL_WIDTH) for cell in (row or [])]
            cells.extend([''] * (num_cols - len(cells)))
            rows.append(cells)

        col_widths = [
            max(len(cells[col_idx]) for cells in rows)
            for col_idx in range(num_cols)
        ]
        separator = "+" + "+".join("-" * (w + 2) for w in col_widths) + "+"

        # The leading "" yields the newline that precedes the [TABLE] marker.
        lines = ["", "[TABLE]"]
        for row_idx, cells in enumerate(rows):
            lines.append(separator)
            lines.append(
                "|"
                + "|".join(f" {cell:<{w}} " for cell, w in zip(cells, col_widths))
                + "|"
            )
            # Add separator after header row
            if row_idx == 0:
                lines.append(separator.replace("-", "="))

        lines.append(separator)
        lines.append("[/TABLE]")
        return "\n".join(lines) + "\n"

    def clean_text(self, text: str) -> str:
        """
        Clean extracted text by removing excessive whitespace and normalizing.

        Line breaks are preserved: only runs of spaces/tabs are collapsed,
        spaces next to a line break are dropped, and three or more
        consecutive line breaks are reduced to a single blank line.
        Common ligature characters are expanded to plain letters.

        Args:
            text (str): Raw text

        Returns:
            Cleaned text (``""`` for empty or ``None`` input)
        """
        if not text:
            return ""

        text = text.translate(self._LIGATURES)

        # Collapse horizontal whitespace only; matching "\s+" here would also
        # swallow the newlines that the line-break normalization relies on.
        text = self._HORIZONTAL_WS.sub(' ', text)
        text = self._SPACE_AROUND_NEWLINE.sub('\n', text)

        # Normalize line breaks
        text = self._EXTRA_BLANK_LINES.sub('\n\n', text)

        return text.strip()

    def extract_page_content(self, page, page_num: int,
                            remove_headers_footers: bool = True) -> Dict:
        """
        Extract all content from a single page including text and tables.

        Header/footer removal is only attempted on pages with more than
        three lines of text, and only the first and the last line are
        examined. Any exception is reported as a printed warning and the
        data gathered so far is returned.

        Args:
            page: pdfplumber page object
            page_num (int): Page number
            remove_headers_footers (bool): Whether to attempt removing headers/footers

        Returns:
            Dictionary with keys ``page_number``, ``text``, ``tables`` and
            ``has_tables``
        """
        page_data = {
            'page_number': page_num,
            'text': '',
            'tables': [],
            'has_tables': False
        }

        try:
            # Extract tables first
            tables = self.extract_tables(page)
            if tables:
                page_data['has_tables'] = True
                page_data['tables'] = tables

            # Extract text
            text = page.extract_text()

            if text:
                # Split into lines for header/footer detection
                lines = text.split('\n')

                if remove_headers_footers and len(lines) > 3:
                    # Check first and last lines for headers/footers
                    if self.is_likely_header_footer(lines[0]):
                        lines = lines[1:]
                    if self.is_likely_header_footer(lines[-1]):
                        lines = lines[:-1]

                page_data['text'] = self.clean_text('\n'.join(lines))

        except Exception as e:
            # Best-effort: one bad page must not abort the whole document.
            print(f"Warning: Error processing page {page_num}: {str(e)}")

        return page_data

    def extract_all(self, remove_headers_footers: bool = True) -> List[Dict]:
        """
        Extract content from all pages in the PDF.

        Results from any previous call are discarded first, so calling this
        method repeatedly does not duplicate pages.

        Args:
            remove_headers_footers (bool): Whether to attempt removing headers/footers

        Returns:
            List of page data dictionaries (also stored in ``pages_data``),
            or ``[]`` if the PDF could not be processed; in that case
            ``pages_data`` keeps whatever pages were read before the failure
        """
        # Start from a clean slate; appending to old results would repeat pages.
        self.pages_data = []

        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                print(f"Processing PDF: {self.pdf_path}")
                print(f"Total pages: {len(pdf.pages)}\n")
                print("=" * 80)

                for page_num, page in enumerate(pdf.pages, start=1):
                    page_data = self.extract_page_content(
                        page, page_num, remove_headers_footers
                    )
                    self.pages_data.append(page_data)

        except Exception as e:
            print(f"Error: Failed to process PDF: {str(e)}")
            return []

        return self.pages_data

    def print_page(self, page_data: Dict) -> None:
        """
        Print formatted page content.

        Prints the page heading, the page text (if any), each table rendered
        with ``format_table``, and a closing 80-character rule.

        Args:
            page_data (Dict): Page data dictionary
        """
        print(f"\n--- Page {page_data['page_number']} ---\n")

        if page_data['text']:
            print(page_data['text'])

        if page_data['has_tables']:
            for table in page_data['tables']:
                print(f"\n{self.format_table(table)}")

        print("\n" + "=" * 80)

    def print_all(self) -> None:
        """Print all extracted pages."""
        for page_data in self.pages_data:
            self.print_page(page_data)

    def get_full_text(self, include_tables: bool = True) -> str:
        """
        Get all text from the PDF as a single string.

        Page texts (and, optionally, formatted tables) are joined with a
        blank line between them, in page order. Pages without text
        contribute nothing.

        Args:
            include_tables (bool): Whether to include formatted tables

        Returns:
            Complete text content
        """
        full_text = []

        for page_data in self.pages_data:
            if page_data['text']:
                full_text.append(page_data['text'])

            if include_tables and page_data['has_tables']:
                full_text.extend(
                    self.format_table(table) for table in page_data['tables']
                )

        return '\n\n'.join(full_text)


def main():
    """
    Command-line entry point: extract a PDF and print every page.

    The PDF path is the first command-line argument that is not the
    ``--keep-headers-footers`` flag, so the flag may appear before or after
    the path. When no path is given, usage help is printed and the process
    exits with status 1.
    """
    keep_flag = '--keep-headers-footers'
    args = sys.argv[1:]

    # Skip the flag when looking for the path; otherwise
    # "script.py --keep-headers-footers doc.pdf" would open the flag as a file.
    positional = [arg for arg in args if arg != keep_flag]

    if not positional:
        print("Usage: python script.py <path_to_pdf_file> [--keep-headers-footers]")
        print("Example: python script.py document.pdf")
        print("\nOptions:")
        print("  --keep-headers-footers    Don't attempt to remove headers/footers")
        sys.exit(1)

    pdf_path = positional[0]
    remove_headers_footers = keep_flag not in args

    # Create extractor and process PDF
    extractor = PDFTextExtractor(pdf_path)
    extractor.extract_all(remove_headers_footers=remove_headers_footers)

    # Print all pages
    extractor.print_all()

    # NOTE: extractor.get_full_text() returns the same content as one string
    # if a single-document view is needed instead of per-page output.


# Run the command-line entry point only when this code is executed as a script, not when imported.
if __name__ == "__main__":
    main()