<a href="https://colab.research.google.com/github/Harish18010/pdf_summerizer/blob/main/PDF_Summerizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers  pymupdf torch gtts

In [3]:
import fitz
import re
from transformers import pipeline

In [4]:

from gtts import gTTS
import os
def save_tts(text, filename, lang='en'):
    """Convert `text` to speech with gTTS and write the audio to `filename`.

    Args:
        text: The text to be spoken.
        filename: Destination path of the audio file.
        lang: Language code forwarded to gTTS (defaults to 'en').
    """
    # Build the speech object and persist it in a single chained call.
    gTTS(text=text, lang=lang).save(filename)
    confirmation = f" Audio summary saved as {filename}"
    print(confirmation)


In [None]:
from google.colab import files

# Open Colab's upload dialog (use shift or ctrl to select multiple files) and
# remember the uploaded file names for the processing loop.
uploaded = files.upload()
pdf_files = [uploaded_name for uploaded_name in uploaded]
print("Uploaded files:", pdf_files)

In [6]:

def extract_and_clean_text(pdf_path):
    """Extract the text of every page of a PDF and normalize its whitespace.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (`fitz`).

    Returns:
        The text of all pages in order, with every run of whitespace
        (including newlines) collapsed to a single space and with leading and
        trailing whitespace removed.
    """
    # The context manager closes the document even if reading a page fails,
    # so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]

    # `\s+` also matches newlines, so a single substitution both merges blank
    # lines and flattens the text onto one line.
    return re.sub(r'\s+', ' ', "\n".join(page_texts)).strip()

In [7]:
def chunk_text(text, max_chunk_len=1000):
    """Split `text` into chunks of at most `max_chunk_len` characters.

    Each chunk ends at the last period inside its window when one exists;
    otherwise the window is cut at exactly `max_chunk_len` characters.

    Args:
        text: The string to split.
        max_chunk_len: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of non-empty chunks, each stripped of surrounding whitespace.
        An empty or whitespace-only `text` yields an empty list.

    Raises:
        ValueError: If `max_chunk_len` is smaller than 1 (the window could
            never advance, so the loop would not terminate).
    """
    if max_chunk_len < 1:
        raise ValueError("max_chunk_len must be a positive integer")

    chunks = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = start + max_chunk_len
        if end >= text_len:
            # The remainder fits in one window.
            cut = text_len
        else:
            last_period = text.rfind('.', start, end)
            # No period, or only one at the very start of the window:
            # fall back to a hard cut at the window boundary.
            cut = last_period + 1 if last_period > start else end

        piece = text[start:cut].strip()
        if piece:  # never hand a blank chunk to the caller
            chunks.append(piece)
        start = cut
    return chunks

In [8]:
def get_summarizer(model_name="facebook/bart-large-cnn",device=-1):
    """Create a `transformers` summarization pipeline.

    Args:
        model_name: Identifier of the model handed to `pipeline` as `model`.
        device: Device index handed to `pipeline` as `device`.

    Returns:
        Whatever `pipeline("summarization", ...)` returns for these options.
    """
    pipeline_options = {"model": model_name, "device": device}
    return pipeline("summarization", **pipeline_options)

In [9]:
def summarize_with_mode(text, summarizer, mode="brief"):
    """Summarize `text` with length limits chosen by `mode`.

    Args:
        text: The text to summarize.
        summarizer: Callable invoked as
            `summarizer(text, max_length=..., min_length=..., do_sample=False,
            truncation=True)`; its first result must expose `'summary_text'`.
        mode: "brief", "detailed" or "bullet". Any other value falls back to
            the default limits (max 150, min 50).

    Returns:
        The summary string. In "bullet" mode the summary is split on '. ' and
        returned as one "• sentence." line per sentence. Blank input returns
        "" without calling the summarizer.
    """
    # (max_length, min_length) passed to the summarizer for each mode.
    length_presets = {
        "brief": (80, 30),
        "detailed": (200, 100),
        "bullet": (150, 70),
    }
    max_len, min_len = length_presets.get(mode, (150, 50))

    # Nothing to summarize: avoid invoking the model on blank input.
    if not text or not text.strip():
        return ""

    # truncation=True makes the pipeline clip inputs that tokenize to more
    # tokens than the model accepts instead of failing on them.
    summary = summarizer(
        text,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        truncation=True,
    )[0]['summary_text']

    if mode != "bullet":
        return summary

    # Strip first, then filter: a whitespace-only fragment must not turn into
    # a bullet that contains nothing but a period.
    sentences = [fragment.strip() for fragment in summary.split('. ')]
    return "\n".join(
        f"• {sentence}" if sentence.endswith('.') else f"• {sentence}."
        for sentence in sentences
        if sentence
    )

In [10]:
def multi_stage_summarize_with_mode(text, summarizer, mode="brief", max_chunk_len=1000, group_size=2, final_chunk_threshold=8, final_max_chunk_len=800):
    """Summarize a long text in two stages.

    Stage 1 splits `text` with `chunk_text` and summarizes every chunk with
    `summarize_with_mode`. The chunk summaries are then merged, `group_size`
    at a time, into paragraphs. If there are more paragraphs than
    `final_chunk_threshold`, the paragraphs are returned as they are;
    otherwise the paragraph text is chunked again and summarized once more.

    Args:
        text: The text to summarize.
        summarizer: Summarizer forwarded to `summarize_with_mode`.
        mode: Summarization mode forwarded to `summarize_with_mode`.
        max_chunk_len: Maximum characters per stage-1 chunk.
        group_size: Number of chunk summaries merged into one paragraph;
            must be >= 1.
        final_chunk_threshold: Maximum number of paragraphs for which the
            final summarization stage still runs.
        final_max_chunk_len: Maximum characters per chunk in the final stage.

    Returns:
        The summary string; paragraphs / final summaries are separated by a
        blank line.

    Raises:
        ValueError: If `group_size` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    chunks = chunk_text(text, max_chunk_len=max_chunk_len)
    print(f" Stage 1: Summarizing {len(chunks)} chunks in '{mode}' mode...")

    chunk_summaries = []
    for i, chunk in enumerate(chunks, start=1):
        print(f" Summarizing chunk {i}/{len(chunks)}...")
        chunk_summaries.append(summarize_with_mode(chunk, summarizer, mode=mode))

    # Group summaries as paragraphs. Summaries of one group share a paragraph
    # (bullet summaries stay one bullet per line); only the groups themselves
    # are separated by a blank line, so `group_size` shapes the layout.
    within_group_sep = "\n" if mode == "bullet" else " "
    grouped_paragraphs = [
        within_group_sep.join(chunk_summaries[i:i + group_size])
        for i in range(0, len(chunk_summaries), group_size)
    ]
    formatted_summary = "\n\n".join(grouped_paragraphs)

    if len(grouped_paragraphs) > final_chunk_threshold:
        print(" Too many chunks — returning paragraph-formatted summary only.")
        return formatted_summary

    print(" Final summarization stage...")
    final_chunks = chunk_text(formatted_summary, max_chunk_len=final_max_chunk_len)
    final_summaries = []
    for i, chunk in enumerate(final_chunks, start=1):
        print(f" Final summarizing chunk {i}/{len(final_chunks)}...")
        final_summaries.append(summarize_with_mode(chunk, summarizer, mode=mode))

    return "\n\n".join(final_summaries)


In [None]:


# Load summarizer
import torch

try:
    # Use the first CUDA device when one is available, otherwise the CPU (-1).
    device = 0 if torch.cuda.is_available() else -1
    summarizer = get_summarizer(device=device)
except RuntimeError:
    print(" GPU error. Falling back to smaller model on CPU.")
    # get_summarizer takes `model_name`, not `model`; the wrong keyword would
    # raise TypeError and defeat this fallback.
    summarizer = get_summarizer(model_name="sshleifer/distilbart-cnn-12-6", device=-1)

# Summarize text using multi-stage approach
for pdf_path in pdf_files:
    print(f"\n Processing: {pdf_path}")

    try:
        text = extract_and_clean_text(pdf_path)
    except Exception as e:
        # A single unreadable PDF must not stop the remaining files.
        print(f" Error reading {pdf_path}: {e}")
        continue

    if not text.strip():
        print(f" No text found in {pdf_path}, skipping.")
        continue

    print("Starting summarization...")
    mode_choice = input("Choose summarization mode (brief/detailed/bullet): ").strip().lower()
    if mode_choice not in ["brief", "detailed", "bullet"]:
        print("Invalid mode choice, defaulting to 'brief'.")
        mode_choice = "brief"

    summary = multi_stage_summarize_with_mode(text, summarizer, mode=mode_choice)

    # An empty summary would only produce an empty file and leave nothing for
    # text-to-speech to read, so skip this document.
    if not summary.strip():
        print(f" No summary produced for {pdf_path}, skipping.")
        continue

    print(summary)

    # Name the outputs after the PDF, without its extension.
    base_name = pdf_path.rsplit('.', 1)[0]
    summary_filename = f"{base_name}_summary.txt"
    with open(summary_filename, "w", encoding="utf-8") as f:
        f.write(summary)

    print(f" Summary saved as: {summary_filename}")
    files.download(summary_filename)

    generate_audio = input("Do you want to generate audio summaries? (y/n): ").strip().lower() == 'y'
    if generate_audio:
        audio_filename = f"{base_name}_summary.mp3"
        try:
            save_tts(summary, audio_filename)
        except Exception as e:
            # The text summary is already saved; a failed audio conversion
            # should not abort the remaining PDFs.
            print(f" Error generating audio for {pdf_path}: {e}")
            continue
        files.download(audio_filename)