In [11]:
import PyPDF2
import sys

def extract_text_from_pdf(pdf_path):
    """
    Print the text of every page of a PDF file, one page at a time.

    Output goes to stdout: a "Total pages" header, then for each page a
    "--- Page N ---" banner (N is 1-based), the text returned by
    ``page.extract_text()``, and a separator rule of 80 ``=`` characters.

    Errors are reported, not raised: a missing file, a PDF that PyPDF2
    rejects with ``PdfReadError``, or any other exception results in a
    printed message and a normal return.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        None
    """
    separator = "=" * 80

    try:
        # Binary mode is required; the reader parses raw PDF bytes.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            pages = pdf_reader.pages

            print(f"Total pages in PDF: {len(pages)}\n")
            print(separator)

            # enumerate(start=1) yields the human-facing page number directly,
            # so no index arithmetic or repeated lookups are needed.
            for page_number, page in enumerate(pages, start=1):
                print(f"\n--- Page {page_number} ---\n")
                print(page.extract_text())
                print("\n" + separator)

    except FileNotFoundError:
        print(f"Error: File '{pdf_path}' not found.")
    except PyPDF2.errors.PdfReadError:
        print(f"Error: '{pdf_path}' is not a valid PDF file or is corrupted.")
    except Exception as e:
        # Last-resort boundary for interactive use: report and carry on.
        print(f"An error occurred: {e}")

# Script entry point: when run directly (not imported), dump every page of
# the sample report in the working directory to stdout.
if __name__ == "__main__":
    
    pdf_path = "report.pdf"
    extract_text_from_pdf(pdf_path)

Total pages in PDF: 71


--- Page 1 ---

FP25-119-D-Rehnuma
Project Team
Faseeh Iqbal 22I-1856
Ahmad Hasan 22I-1945
Manhab Zafar 22I-1957
Session 2022-2026
Supervised by
Ms. Amna Irum
Co-Supervised by
Dr. Qurut-ul-Ain
Department of Data Science And Artificial Intelligence
National University of Computer and Emerging Sciences
Islamabad, Pakistan
June, 2026


--- Page 2 ---

Contents
1 Introduction 11
1.1 Existing Solutions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11
1.2 Problem Statement . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 22
1.3 Scope . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 33
1.4 Modules . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 33
1.4.1 Module 1: Dataset Pipeline . . . . . . . . . . . . . . . . . . . . 33
1.4.2 Module 2: Animation Pipeline . . . . . . . . . . . . . . . . . . . 44
1.4.3 Module 3: Quiz System . . . . . . . . . . . . . . . . . . . . . . . 44
1.4.4 Module 4: Real-Time QA 

In [2]:
#!/usr/bin/env python3
"""
Clean text extractor from PDF (page by page) for chatbot knowledge base.

- Uses pdfplumber's page.extract_text with tuned tolerances
- Normalizes whitespace (removes extra spaces / weird line breaks)
- Returns a single cleaned text string
"""

import pdfplumber
import re
from pathlib import Path

# <<< CHANGE THIS TO YOUR PDF FILE PATH >>>
PDF_PATH = r"report.pdf"  # e.g. r"C:\Users\you\Documents\sample.pdf"


def normalize_whitespace(text: str) -> str:
    """
    Normalize whitespace for NLP / chatbot use.

    - Treat ``\\r\\n`` and lone ``\\r`` as line breaks.
    - Re-join words hyphenated across a single line break:
      'pre-\\nrecorded' -> 'pre-recorded' (the hyphen is kept).
    - Split into paragraphs on blank lines; a line containing only
      whitespace counts as blank.
    - Inside each paragraph, collapse every whitespace run (including
      single newlines) into one space and trim the ends.

    Args:
        text: Raw extracted text; may be empty or None-like.

    Returns:
        The non-empty paragraphs joined by a blank line (``"\\n\\n"``),
        or ``""`` when there is no non-whitespace content.
    """
    if not text:
        return ""

    # Unify line endings first so every later step only has to reason
    # about "\n". A lone "\r" becomes a line break instead of vanishing,
    # which would otherwise glue the neighbouring words together.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Fix hyphenation at a line break. Only non-newline whitespace is
    # allowed around the break, and the next line must carry content, so a
    # dash that ends a paragraph never swallows the blank line after it.
    text = re.sub(r"-[^\S\n]*\n[^\S\n]*(?=\S)", "-", text)

    # A paragraph break is two newlines with nothing but whitespace between
    # them; this also collapses longer runs of blank lines.
    paragraphs = re.split(r"\n\s*\n", text)

    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so join() yields single-spaced text.
    normalized_paragraphs = [" ".join(block.split()) for block in paragraphs]

    # Join paragraphs with a double newline so some structure survives.
    return "\n\n".join(block for block in normalized_paragraphs if block)


def clean_lines_remove_page_numbers(text: str) -> str:
    """
    Drop lines that consist solely of a page-number marker.

    A line is dropped when, after trimming surrounding whitespace, it is
    entirely one of:

    - one to three digits ('1', '12'),
    - 'Page' (any letter case) followed by one to three digits ('Page 3'),
    - an 'N / M' pair of one-to-three-digit numbers ('3 / 10').

    All other lines are kept untouched (original spacing included) and
    re-joined with '\\n'.
    """
    # (pattern, flags) pairs; a line is a page-number line if any of them
    # matches the whole trimmed line.
    page_number_patterns = (
        (r"\d{1,3}", 0),
        (r"Page\s+\d{1,3}", re.IGNORECASE),
        (r"\d{1,3}\s*/\s*\d{1,3}", 0),
    )

    def is_page_number(raw_line: str) -> bool:
        candidate = raw_line.strip()
        return any(
            re.fullmatch(pattern, candidate, flags=flags)
            for pattern, flags in page_number_patterns
        )

    return "\n".join(
        raw_line for raw_line in text.splitlines() if not is_page_number(raw_line)
    )


def extract_clean_text_from_pdf(
    pdf_path: str,
    header_height_ratio: float = 0.08,
    footer_height_ratio: float = 0.08,
    x_tolerance: float = 1.0,
    y_tolerance: float = 3.0,
) -> str:
    """
    Build one cleaned text string from all pages of a PDF.

    For every page:

    1. Crop away a top band and a bottom band, each sized as a fraction of
       the page height, to discard running headers and footers.
    2. Extract text from the remaining region with the given tolerances.
    3. Remove standalone page-number lines.

    Pages left with no visible text are skipped. The surviving page texts
    are joined with blank lines and passed through ``normalize_whitespace``.

    Args:
        pdf_path: Location of the PDF file.
        header_height_ratio: Fraction of the page height cropped from the top.
        footer_height_ratio: Fraction of the page height cropped from the bottom.
        x_tolerance: Forwarded to pdfplumber's ``extract_text``.
        y_tolerance: Forwarded to pdfplumber's ``extract_text``.

    Returns:
        The cleaned, whitespace-normalized text of the whole document.
    """
    source = Path(pdf_path)
    page_texts = []

    with pdfplumber.open(source) as pdf:
        for page in pdf.pages:
            page_width, page_height = page.width, page.height

            # Vertical extent of the body region, measured from the page top.
            # NOTE(review): the box starts at x=0 / y=0, i.e. it presumes the
            # page's own bounding box begins at the origin - confirm for PDFs
            # with shifted media boxes.
            body_top = header_height_ratio * page_height
            body_bottom = page_height - footer_height_ratio * page_height
            body = page.crop((0, body_top, page_width, body_bottom))

            extracted = body.extract_text(
                x_tolerance=x_tolerance,
                y_tolerance=y_tolerance,
            )
            # extract_text may yield a falsy value; treat it as empty text.
            without_numbers = clean_lines_remove_page_numbers(extracted or "")

            if not without_numbers.strip():
                continue
            page_texts.append(without_numbers)

    return normalize_whitespace("\n\n".join(page_texts))


# Example usage in a notebook cell: extract the configured PDF and show it.
clean_text = extract_clean_text_from_pdf(PDF_PATH)
print(clean_text)  # prints the ENTIRE cleaned text (no truncation is applied)


FP25-119-D-Rehnuma Project Team Faseeh Iqbal 22I-1856 Ahmad Hasan 22I-1945 Manhab Zafar 22I-1957 Session 2022-2026 Supervised by Ms. Amna Irum Co-Supervised by Dr. Qurut-ul-Ain Department of Data Science And Artificial Intelligence National University of Computer and Emerging Sciences Islamabad, Pakistan June, 2026

Contents 1 Introduction 11 1.1 Existing Solutions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11 1.2 Problem Statement . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 22 1.3 Scope . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 33 1.4 Modules . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 33 1.4.1 Module 1: Dataset Pipeline . . . . . . . . . . . . . . . . . . . . 33 1.4.2 Module 2: Animation Pipeline . . . . . . . . . . . . . . . . . . . 44 1.4.3 Module 3: Quiz System . . . . . . . . . . . . . . . . . . . . . . . 44 1.4.4 Module 4: Real-Time QA System . . . . . . . . . . . . . . . . . 44 1.4.5 Module 5