# PDF Text Summarizer
This notebook demonstrates PDF text summarization using the BART transformer model (`facebook/bart-large-cnn`) from the Hugging Face `transformers` library.

In [1]:
# Initialize IPython widgets and output.
# `display` and `clear_output` are also used by the UI cell at the end of the
# notebook, so this cell has to run before that one.
from IPython.display import display, clear_output
import ipywidgets as widgets
# Close every widget that is still open, presumably so that re-running the
# notebook does not leave stale controls from an earlier run behind.
widgets.Widget.close_all()
# Remove whatever output this cell currently shows.
clear_output()

In [None]:
# Install required packages
!pip install --quiet transformers PyPDF2 torch tqdm ipywidgets fpdf2

import os
import requests
from pathlib import Path

print("Setup complete.")

Error downloading DejaVuSansCondensed.ttf: 404 Client Error: Not Found for url: https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSansCondensed.ttf


HTTPError: 404 Client Error: Not Found for url: https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSansCondensed.ttf

In [13]:
# Configure environment variables
# Judging by its name, this flag silences the Hugging Face Hub warning about
# symlinks not being available for its cache.
# NOTE(review): presumably this has to be set before `transformers` is
# imported in the next cell - confirm.
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

In [None]:
import PyPDF2
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
from tqdm.notebook import tqdm
import ipywidgets as widgets
from IPython.display import display, HTML
import io
from fpdf import FPDF
import tempfile
from datetime import datetime

# Load model and tokenizer globally to avoid reloading
# `tokenizer` and `model` are module-level names that summarize_text() reads
# directly, so they are created once here instead of on every call.
# NOTE(review): from_pretrained() presumably downloads the checkpoint on first
# use, so this cell can take a while - confirm caching behaviour.
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

def process_uploaded_file(upload):
    """Wrap the bytes of an uploaded file in an in-memory binary stream.

    Args:
        upload: Mapping describing one uploaded file; only its ``'content'``
            entry (the raw file bytes) is read.

    Returns:
        io.BytesIO: A stream positioned at offset 0 holding the file content.
    """
    return io.BytesIO(upload['content'])

def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts; the notebook passes the
            ``io.BytesIO`` stream produced by ``process_uploaded_file``.

    Returns:
        str: The text of all pages joined by newlines. A page without
        extractable text contributes an empty string; a PDF without pages
        yields ``''``.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # `or ''` keeps a page that yields no text (None or '') from breaking the
    # join below.
    page_texts = [
        page.extract_text() or ''
        for page in tqdm(pdf_reader.pages, desc="Reading PDF")
    ]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page; downstream chunking splits on whitespace.
    return '\n'.join(page_texts)

def chunk_text(text, max_chunk_size=1000):
    """Split text into whitespace-delimited chunks of bounded length.

    Words are never split. Consecutive words are packed greedily into a chunk
    as long as the chunk, joined by single spaces, stays within
    ``max_chunk_size`` characters.

    Args:
        text: The text to split. Any run of whitespace separates words.
        max_chunk_size: Maximum number of characters per chunk. Note this is a
            character count, not a token count.

    Returns:
        list[str]: The chunks in original order. Every chunk is non-empty. A
        single word longer than ``max_chunk_size`` becomes a chunk of its own
        (and is therefore the only kind of chunk that can exceed the limit).
        Empty or whitespace-only input returns ``[]``.
    """
    chunks = []
    current_chunk = []
    # Invariant: current_size == len(' '.join(current_chunk)).
    current_size = 0

    for word in text.split():
        # A separating space is only needed when the chunk already has words.
        added = len(word) + (1 if current_chunk else 0)
        if current_chunk and current_size + added > max_chunk_size:
            # The word does not fit: close the current chunk and start a new
            # one. The `current_chunk` guard ensures an oversized first word
            # never causes an empty chunk to be emitted.
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_size = len(word)
        else:
            current_chunk.append(word)
            current_size += added

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def summarize_text(text, max_length=150, min_length=50):
    """Summarize text with the globally loaded BART model.

    The text is split with ``chunk_text`` and each chunk is summarized
    independently; the per-chunk summaries are joined with single spaces.

    Args:
        text: The text to summarize.
        max_length: ``max_length`` passed to ``model.generate`` for each chunk.
        min_length: ``min_length`` passed to ``model.generate`` for each
            chunk. Values larger than ``max_length`` are clamped to it.

    Returns:
        str: The concatenated chunk summaries, or ``''`` when the text
        contains no words.
    """
    # The UI sliders allow min_length > max_length; a minimum above the
    # maximum cannot be satisfied, so clamp it instead of forwarding it.
    min_length = min(min_length, max_length)

    summaries = []
    for chunk in tqdm(chunk_text(text), desc="Summarizing"):
        # Never ask the model to "summarize" an empty chunk.
        if not chunk.strip():
            continue
        inputs = tokenizer(chunk, max_length=1024, truncation=True, return_tensors='pt')
        summary_ids = model.generate(
            inputs['input_ids'],
            # Pass the mask produced by the tokenizer explicitly rather than
            # letting generate() infer it from the input ids.
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            min_length=min_length,
            num_beams=4,
            length_penalty=2.0,
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return ' '.join(summaries)

def create_summary_pdf(original_filename, text, summary):
    """Create a PDF report containing the summary followed by the original text.

    All caller-supplied strings are reduced to ASCII (unsupported characters
    become ``?``) before being written, so that the core Helvetica font is
    never handed characters outside ASCII.

    Args:
        original_filename: Name of the source document, shown in the header.
        text: Full text extracted from the source document.
        summary: Generated summary of ``text``.

    Returns:
        str | None: Path of the generated temporary ``.pdf`` file, or ``None``
        if PDF creation failed (the error is printed). The file is not deleted
        automatically; the caller owns it.
    """
    def to_ascii(value):
        # Replace every non-ASCII character with '?'.
        return value.encode('ascii', 'replace').decode('ascii')

    pdf = FPDF()
    pdf.add_page()

    try:
        # Add title
        pdf.set_font('Helvetica', 'B', 16)
        pdf.cell(0, 10, 'Document Summary', align='C', new_x="LMARGIN", new_y="NEXT")
        pdf.ln(10)

        # Add metadata. The filename comes from the upload and may contain
        # non-ASCII characters, so it is sanitised like the body text.
        pdf.set_font('Helvetica', 'B', 12)
        pdf.cell(0, 10, f'Original Document: {to_ascii(original_filename)}', new_x="LMARGIN", new_y="NEXT")
        pdf.cell(0, 10, f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M")}', new_x="LMARGIN", new_y="NEXT")
        pdf.ln(10)

        # Add summary
        pdf.set_font('Helvetica', 'B', 14)
        pdf.cell(0, 10, 'Summary:', new_x="LMARGIN", new_y="NEXT")
        pdf.set_font('Helvetica', '', 12)
        pdf.multi_cell(0, 10, to_ascii(summary))
        pdf.ln(10)

        # Add original text
        pdf.set_font('Helvetica', 'B', 14)
        pdf.cell(0, 10, 'Original Text:', new_x="LMARGIN", new_y="NEXT")
        pdf.set_font('Helvetica', '', 12)
        pdf.multi_cell(0, 10, to_ascii(text))

        # Reserve a unique temporary path, then close the handle BEFORE
        # writing: some platforms (notably Windows) refuse to open a file by
        # name while the NamedTemporaryFile handle is still open.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            output_path = tmp.name
        pdf.output(output_path)
        return output_path

    except Exception as e:
        # Top-level boundary for the UI: report the problem and signal
        # failure with None instead of propagating.
        print(f"Error creating PDF: {str(e)}")
        return None

def create_summary_pdf_ascii(original_filename, text, summary):
    """Fallback PDF creator that writes ASCII-only content with core fonts.

    Every caller-supplied string is reduced to ASCII (unsupported characters
    become ``?``) before it is written, which is what makes this variant safe
    to use with the built-in Helvetica font.

    Args:
        original_filename: Name of the source document, shown in the header.
        text: Full text extracted from the source document.
        summary: Generated summary of ``text``.

    Returns:
        str: Path of the generated temporary ``.pdf`` file. The file is not
        deleted automatically; the caller owns it.

    Raises:
        Exception: Any error raised by FPDF or by writing the file is
            propagated to the caller (no error handling is done here).
    """
    def to_ascii(value):
        # Replace every non-ASCII character with '?'.
        return value.encode('ascii', 'replace').decode('ascii')

    pdf = FPDF()
    pdf.add_page()

    # Use core fonts
    pdf.set_font('Helvetica', 'B', 16)
    pdf.cell(0, 10, 'Document Summary', align='C', new_x="LMARGIN", new_y="NEXT")
    pdf.ln(10)

    # Add metadata
    pdf.set_font('Helvetica', 'B', 12)
    pdf.cell(0, 10, f'Original Document: {to_ascii(original_filename)}', new_x="LMARGIN", new_y="NEXT")
    pdf.cell(0, 10, f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M")}', new_x="LMARGIN", new_y="NEXT")
    pdf.ln(10)

    # Add summary
    pdf.set_font('Helvetica', 'B', 14)
    pdf.cell(0, 10, 'Summary:', new_x="LMARGIN", new_y="NEXT")
    pdf.set_font('Helvetica', '', 12)
    pdf.multi_cell(0, 10, to_ascii(summary))
    pdf.ln(10)

    # Add original text
    pdf.set_font('Helvetica', 'B', 14)
    pdf.cell(0, 10, 'Original Text:', new_x="LMARGIN", new_y="NEXT")
    pdf.set_font('Helvetica', '', 12)
    pdf.multi_cell(0, 10, to_ascii(text))

    # Reserve a unique temporary path, then close the handle BEFORE writing:
    # some platforms (notably Windows) refuse to open a file by name while the
    # NamedTemporaryFile handle is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        output_path = tmp.name
    pdf.output(output_path)
    return output_path

In [15]:
# Create widgets
# File picker restricted to a single .pdf file; on_upload_change() reads its
# `value` whenever it changes.
upload_button = widgets.FileUpload(
    description='Upload PDF',
    accept='.pdf',
    multiple=False
)

# Forwarded to summarize_text() as `max_length`.
max_length_slider = widgets.IntSlider(
    value=150,
    min=50,
    max=500,
    step=10,
    description='Max Length:',
    style={'description_width': 'initial'}
)

# Forwarded to summarize_text() as `min_length`.
# NOTE(review): the two ranges overlap, so this slider can be set higher than
# the max-length slider; nothing here prevents that.
min_length_slider = widgets.IntSlider(
    value=50,
    min=30,
    max=200,
    step=10,
    description='Min Length:',
    style={'description_width': 'initial'}
)

# Starts disabled; on_upload_change() enables it once a summary exists.
download_button = widgets.Button(
    description='Download Summary',
    disabled=True,
    button_style='success'
)

# Separate output areas: one for progress/status messages, one for the
# summary text itself.
progress_output = widgets.Output()
summary_output = widgets.Output()

def on_upload_change(change):
    """Summarize the newly uploaded PDF and prepare the download.

    Observer for the upload widget's ``value`` trait. Progress messages and
    errors are written to ``progress_output``; the summary is written to
    ``summary_output``. The download button is enabled only when a summary
    PDF was actually created.

    Args:
        change: Change notification supplied by the widget framework. It is
            not inspected; the current ``upload_button.value`` is read instead.
    """
    with progress_output:
        progress_output.clear_output()
        summary_output.clear_output()
        download_button.disabled = True

        if not upload_button.value:
            return

        try:
            # Get the uploaded file.
            # Assumes `value` is a sequence of per-file mappings with
            # 'content' and 'name' entries - TODO confirm against the
            # installed ipywidgets version.
            uploaded_file = upload_button.value[0]
            pdf_file = process_uploaded_file(uploaded_file)

            print("Reading PDF file...")
            text = read_pdf(pdf_file)

            # Image-only or empty PDFs yield no text; there is nothing to
            # summarize, so stop before invoking the model.
            if not text.strip():
                print("No extractable text was found in the PDF.")
                return

            print("\nGenerating summary...")
            summary = summarize_text(
                text,
                max_length=max_length_slider.value,
                min_length=min_length_slider.value
            )

            # Generate summary PDF (returns None on failure)
            summary_pdf_path = create_summary_pdf(uploaded_file['name'], text, summary)

            with summary_output:
                summary_output.clear_output()
                print("Summary:")
                print("="*50)
                print(summary)
                print("\nOriginal text length:", len(text))
                print("Summary length:", len(summary))

            # Without a file there is nothing to download: keep the button
            # disabled rather than offering a download that must fail.
            if summary_pdf_path is None:
                print("Summary PDF could not be created; download is unavailable.")
                return

            # Attach the download details to the button, then enable it.
            download_button.file_path = summary_pdf_path
            download_button.filename = f"summary_{uploaded_file['name']}"
            download_button.disabled = False

        except Exception as e:
            # UI boundary: show the problem in the progress area instead of
            # letting the callback fail silently.
            print(f"An error occurred: {str(e)}")

def on_download_click(b):
    """Handle download button click.

    Reads the PDF stored at ``b.file_path`` and displays an HTML link that
    embeds it as a base64 ``data:`` URI, so that clicking the link saves the
    file under ``b.filename``.

    Args:
        b: The clicked button. It must carry the ``file_path`` and
            ``filename`` attributes set by ``on_upload_change``.
    """
    # Imported locally because no cell of this notebook imports them.
    import base64
    import html

    try:
        with open(b.file_path, 'rb') as f:
            content = f.read()

        # Encode the PDF so it can be embedded directly in the link.
        b64 = base64.b64encode(content).decode()

        # The filename originates from the uploaded file; escape it before
        # placing it inside an HTML attribute and the link text.
        safe_name = html.escape(b.filename, quote=True)

        # Display download link
        display(HTML(
            f'<a download="{safe_name}" href="data:application/pdf;base64,{b64}" target="_blank">Click to download {safe_name}</a>'
        ))
    except Exception as e:
        # UI boundary: report the failure instead of raising from a callback.
        print(f"Download error: {str(e)}")

# Hook up the callbacks: re-run the summarization whenever the upload widget's
# `value` changes, and build the download link when the button is clicked.
upload_button.observe(on_upload_change, names='value')
download_button.on_click(on_download_click)

# Create layout container
# Stacked vertically: heading, upload control, the two sliders side by side,
# download button, then the progress and summary output areas.
widget_container = widgets.VBox([
    widgets.HTML("<h3>PDF Summarizer Controls</h3>"),
    upload_button,
    widgets.HBox([max_length_slider, min_length_slider]),
    download_button,
    progress_output,
    summary_output
])

# Clear any existing outputs and display widgets
# `clear_output` is imported in the first cell of the notebook, not in this
# cell, so that cell must have been run first.
clear_output(wait=True)
display(widget_container)

VBox(children=(HTML(value='<h3>PDF Summarizer Controls</h3>'), FileUpload(value=(), accept='.pdf', description…