<a href="https://colab.research.google.com/github/LuckyBoy587/Notes-Summarizer/blob/master/Colab_Run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notes Summarizer on Colab

This notebook runs the Notes Summarizer on Google Colab. It clones the latest code from GitHub, installs the dependencies, and then processes a PDF that you upload, downloading the result as a text file.

In [None]:
# Setup: Clone repository, install dependencies, and download NLTK data
!git clone https://github.com/LuckyBoy587/Notes-Summarizer.git
%cd Notes-Summarizer
!pip install -r requirements.txt
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Import modules
from config import get_model_tokenizer_device, get_device
from text_processing import split_into_topics
from paraphrasing import paraphrase_chunks
from pdf_extraction import extract_topics_from_pdf
from google.colab import files
import os
import torch
# Report CUDA availability and the selected device up front, so it is clear
# whether the GPU (fp16) path is the one being used.
print(f"torch.cuda.is_available(): {torch.cuda.is_available()}")
print(f"device: {get_device()}")


In [None]:
def summarize_pdf(pdf_filename, paraphrase=True, paraphrase_kwargs=None):
    """Extract topics from a PDF, optionally paraphrase them, then save and download the result.

    The output is a UTF-8 text file with one ``## <topic>`` heading per topic
    followed by one ``• `` bullet per chunk. It is written next to the input as
    ``<name>_paraphrased.txt`` and handed to ``files.download``.

    Args:
        pdf_filename: Path of the PDF to process.
        paraphrase: When True, each topic's chunks are passed through
            ``paraphrase_chunks``; when False, the chunks are written unchanged.
        paraphrase_kwargs: Keyword arguments forwarded to ``paraphrase_chunks``.
            When None, a fast default configuration is used.

    Returns:
        The path of the text file that was written.
    """
    if paraphrase_kwargs is None:
        # Faster generation defaults, used when the caller does not supply any.
        paraphrase_kwargs = {'batch_size': 16, 'num_beams': 1, 'max_length': 64, 'do_sample': True}

    # fast=True with sample_pages=3 asks the extractor to work from a small sample
    # of pages (used to estimate font-size thresholds), which speeds up large PDFs.
    extracted_text = extract_topics_from_pdf(pdf_filename, fast=True, sample_pages=3)
    topics = split_into_topics(extracted_text)

    # Collect the pieces and join once instead of repeatedly growing a string.
    sections = []
    for topic, chunks in topics.items():
        bullets = paraphrase_chunks(chunks, **paraphrase_kwargs) if paraphrase else chunks
        sections.append(f"\n## {topic}\n")
        sections.append("\n".join(f"• {b}" for b in bullets) + "\n")
    output_content = "".join(sections)

    # Derive the output name from the extension only. A plain str.replace('.pdf', ...)
    # is case-sensitive and touches every occurrence, so names such as 'NOTES.PDF'
    # (or names without an extension) would map back onto the input path and the
    # PDF itself would be overwritten.
    stem, ext = os.path.splitext(pdf_filename)
    if ext.lower() != '.pdf':
        stem = pdf_filename
    output_filename = f"{stem}_paraphrased.txt"

    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(output_content)

    print(f"Output saved to {output_filename}")
    # Download the result
    files.download(output_filename)
    return output_filename


In [None]:
# Upload one or more PDFs and summarize each of them
uploaded = files.upload()
for pdf_filename in uploaded:
    try:
        # Run with paraphrasing using faster generation defaults
        summarize_pdf(pdf_filename, paraphrase=True)
    finally:
        # Clean up the uploaded PDF even when processing fails, so a failed run
        # does not leave stale files behind in the working directory.
        if os.path.exists(pdf_filename):
            os.remove(pdf_filename)
