# Notes Summarizer (Colab)

This notebook provides a minimal Gradio UI to run the Notes Summarizer in Google Colab.

Run the cells in order. Install the packages in Colab (see the requirements cell). The notebook will import `get_model_tokenizer_device` from the project's `config.py` and will use GPU if available.

Costs: model loading and inference on a Colab GPU consume compute and may be subject to Colab quotas. If you run out of memory, reduce `max_length` or `num_return_sequences`, or run on CPU (in that case, uncheck `use_fp16`).

In [None]:
# Install the packages the later cells import (IPython shell commands).
!pip install torch
!pip install transformers
!pip install sentencepiece
!pip install gradio
# PyPDF2 is imported lazily by load_text_from_uploaded when a .pdf is uploaded.
!pip install PyPDF2

In [None]:
# Clone or update the GitHub repo into Colab and add it to sys.path
# Replace repo_url with your repo URL if different. If the repo is private, mount Drive or use an authenticated method.
repo_url = "https://github.com/LuckyBoy587/Notes-Summarizer.git"
repo_dir = "/content/Notes-Summarizer"

import os
import subprocess
import sys

print('Repository URL:', repo_url)
if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Attempting 'git pull' to update.")
    try:
        subprocess.run(["git", "-C", repo_dir, "pull"], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # Best effort: a failed update still leaves the existing checkout usable.
        print('git pull failed:', e)
    else:
        print('Pulled latest changes.')
else:
    try:
        subprocess.run(["git", "clone", repo_url, repo_dir], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        print('git clone failed:', e)
    else:
        print('Cloned into', repo_dir)

# Add the repo directory to sys.path so config.py is importable
if repo_dir not in sys.path:
    sys.path.insert(0, repo_dir)
    print('Added to sys.path:', repo_dir)
else:
    # Report the directory itself; sys.path[0] may be some unrelated entry.
    print('Already on sys.path:', repo_dir)

# Later cells import config.py from this directory, so flag its absence now
# instead of letting the import fail further down.
if not os.path.isfile(os.path.join(repo_dir, 'config.py')):
    print(f'Warning: config.py was not found in {repo_dir}; the clone/pull may have failed.')

print('\nNotes:')
print('- If the repository is private, you will need to authenticate (use an access token or mount Google Drive with the repo).')
print('- After this cell runs, you should be able to import from config.py (e.g., from config import get_model_tokenizer_device)')


In [None]:
# Imports and a reminder to set working directory in Colab so config.py is importable
# `os` and `Path` are used by the helper functions defined in the following cells.
import sys, os
from pathlib import Path
# NOTE(review): `time` is not referenced by any cell visible in this notebook.
import time
print('Ensure the project files (including config.py) are available in the Colab working directory or add the path to sys.path.')
# Example (uncomment in Colab if you mount drive or upload repo):
# sys.path.insert(0, '/content/your-repo-path')


In [None]:
# Helper functions: file loading, cleaning, zip creation
import io, zipfile, re
from typing import List

def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    Applies, in order: CRLF -> LF, runs of three or more newlines -> one
    blank line, runs of two or more spaces/tabs -> a single space; the result
    is stripped of leading and trailing whitespace. Any non-string input
    yields an empty string.
    """
    if isinstance(text, str):
        substitutions = (
            (r'\r\n', '\n'),
            (r'\n{3,}', '\n\n'),
            (r'[ \t]{2,}', ' '),
        )
        cleaned = text
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return ''

def load_text_from_uploaded(file) -> str:
    """Return the text content of an uploaded file.

    Args:
        file: One of
            * a filesystem path (``str`` or ``os.PathLike``),
            * an object exposing the on-disk path through a ``name`` attribute
              (e.g. a temporary-file wrapper), or
            * a readable file-like object.

    Returns:
        The extracted text. ``.txt``/``.md`` files are read as UTF-8 (undecodable
        bytes are dropped); ``.pdf`` files are extracted page by page with PyPDF2
        and the pages joined by blank lines; anything else is read through the
        object's ``read()`` or, if it has none, from the path as UTF-8 text.

    Raises:
        RuntimeError: if PyPDF2 is needed but not installed, or the input can
            neither be located on disk nor read as a file-like object.
    """
    # A Path's own `.name` is only the basename, so resolve real paths first.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = getattr(file, 'name', None)

    # `.name` can be a non-string (e.g. a file descriptor); only trust strings.
    if isinstance(path, str) and path and os.path.exists(path):
        ext = Path(path).suffix.lower()
        if ext in ('.txt', '.md'):
            return Path(path).read_text(encoding='utf-8', errors='ignore')
        if ext == '.pdf':
            try:
                import PyPDF2
            except ImportError as exc:
                raise RuntimeError('PyPDF2 is required to extract text from PDFs. Install it in Colab: pip install PyPDF2') from exc
            reader = PyPDF2.PdfReader(path)
            return '\n\n'.join(page.extract_text() or '' for page in reader.pages)
        if not hasattr(file, 'read'):
            # Path-only input with an unlisted extension: treat it as plain text.
            return Path(path).read_text(encoding='utf-8', errors='ignore')

    if not hasattr(file, 'read'):
        raise RuntimeError(f'Unable to read uploaded file: {file!r} is neither an existing path nor a readable object')

    try:
        file.seek(0)
    except (AttributeError, OSError, ValueError):
        # Rewinding is best effort; non-seekable streams are read from their current position.
        pass
    try:
        content = file.read()
    except Exception as e:
        raise RuntimeError(f'Unable to read uploaded file: {e}') from e
    if isinstance(content, bytes):
        return content.decode('utf-8', errors='ignore')
    return str(content)

def make_zip_from_texts(texts: List[str], names: List[str], zip_path: str) -> str:
    """Write each text as a ``.txt`` member of a new zip archive at *zip_path*.

    Args:
        texts: Contents of the archive members.
        names: Desired member names, paired positionally with *texts* (extra
            items in the longer list are ignored). A name may be a full path;
            only its final component is used. Empty names become
            ``summary.txt`` and ``.txt`` is appended when missing.
        zip_path: Destination of the archive (overwritten if it exists).

    Returns:
        *zip_path*, unchanged.
    """
    taken = set()
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as z:
        for txt, name in zip(texts, names):
            # Keep only the final path component so temp-directory paths do
            # not end up inside the archive.
            base = str(name).replace('\\', '/').rsplit('/', 1)[-1] if name else ''
            safe_name = base or 'summary.txt'
            if not safe_name.lower().endswith('.txt'):
                safe_name = safe_name + '.txt'
            # Disambiguate repeated names; duplicate members would shadow each
            # other when the archive is extracted.
            stem = safe_name[:-len('.txt')]
            candidate = safe_name
            suffix = 1
            while candidate.lower() in taken:
                candidate = f'{stem}_{suffix}.txt'
                suffix += 1
            taken.add(candidate.lower())
            z.writestr(candidate, txt)
    return zip_path


def chunk_text(text: str, max_tokens=1024, approx_chars_per_token=4) -> List[str]:
    """Split *text* into pieces of at most ``max_tokens * approx_chars_per_token`` characters.

    Each cut is moved back to the last newline inside the window or, failing
    that, to the last ``'. '``; when neither exists the window is cut at its
    hard limit. Pieces are stripped, and pieces that are empty after
    stripping are dropped.

    Args:
        text: Text to split. Empty/None yields ``[]``.
        max_tokens: Token budget per chunk.
        approx_chars_per_token: Characters assumed per token when converting
            the budget to a character count.

    Returns:
        The chunks in order. Text that already fits is returned as a single,
        unstripped element.

    Raises:
        ValueError: if the resulting character budget is smaller than 1
            (the scan could never advance).
    """
    if not text:
        return []
    # int() keeps slicing valid when a fractional chars-per-token is supplied.
    max_chars = int(max_tokens * approx_chars_per_token)
    if max_chars < 1:
        raise ValueError('max_tokens * approx_chars_per_token must be at least 1')
    total = len(text)
    if total <= max_chars:
        return [text]
    chunks = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            sep = text.rfind('\n', start, end)
            if sep <= start:
                sep = text.rfind('. ', start, end)
            if sep > start:
                # Cut just after the separator so it stays with the preceding chunk.
                end = sep + 1
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end
    return chunks


In [None]:
# Model loader and inference wrapper (uses project's config.get_model_tokenizer_device)
import torch
from typing import Any, List, Tuple

# Process-wide cache filled by load_model() and read by summarize_texts().
MODEL = None
TOKENIZER = None
DEVICE = None
# The use_fp16_on_cuda value the cached MODEL was loaded with (None = unknown / not loaded via load_model).
_LOADED_USE_FP16 = None

def load_model(use_fp16_on_cuda=True) -> Tuple[Any, Any, torch.device]:
    """Load (or reuse) the model, tokenizer and device from the project's config.py.

    The result is cached in the module globals MODEL / TOKENIZER / DEVICE.
    The cache is reused only while *use_fp16_on_cuda* matches the value used
    for the cached load; a different value triggers a reload so that toggling
    the option after the first run actually takes effect.

    Args:
        use_fp16_on_cuda: Forwarded unchanged to
            ``config.get_model_tokenizer_device``.

    Returns:
        ``(model, tokenizer, device)`` as returned by
        ``get_model_tokenizer_device``.

    Raises:
        RuntimeError: if ``get_model_tokenizer_device`` cannot be imported
            from config.py.
    """
    global MODEL, TOKENIZER, DEVICE, _LOADED_USE_FP16
    requested = bool(use_fp16_on_cuda)
    cache_filled = MODEL is not None and TOKENIZER is not None
    # An unknown flag means the globals were populated outside load_model(); keep them as-is.
    if cache_filled and (_LOADED_USE_FP16 is None or _LOADED_USE_FP16 == requested):
        return MODEL, TOKENIZER, DEVICE
    try:
        from config import get_model_tokenizer_device
    except Exception as e:
        raise RuntimeError(f'Failed to import get_model_tokenizer_device from config.py: {e}') from e
    # Drop the stale references first so the old and new model need not be
    # resident at the same time during a reload.
    MODEL = TOKENIZER = None
    _LOADED_USE_FP16 = None
    MODEL, TOKENIZER, DEVICE = get_model_tokenizer_device(use_fp16_on_cuda=use_fp16_on_cuda)
    _LOADED_USE_FP16 = requested
    return MODEL, TOKENIZER, DEVICE

def summarize_texts(texts: List[str], max_length=128, min_length=30, num_return_sequences=1, temperature=1.0, num_beams=4, device=None) -> List[List[str]]:
    """Generate summaries for each text with the globally loaded model.

    Args:
        texts: Input documents; empty/whitespace-only entries produce ``['']``.
        max_length: Passed to ``model.generate`` as ``max_length``.
        min_length: Passed as ``min_length``; clamped so it never exceeds
            *max_length*.
        num_return_sequences: Number of candidate summaries per text.
        temperature: Values other than 1.0 switch generation to sampling
            (``do_sample=True``), since the setting is otherwise ignored by
            non-sampling decoding.
        num_beams: Beam count; raised to *num_return_sequences* when needed,
            because non-sampling decoding cannot return more sequences than
            beams.
        device: Target device for the input tensors. Defaults to the global
            DEVICE, else CUDA when available, else CPU.

    Returns:
        One list of decoded summaries per input text, in input order.

    Raises:
        RuntimeError: if MODEL/TOKENIZER have not been loaded.
    """
    if device is None:
        device = DEVICE or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MODEL
    tokenizer = TOKENIZER
    if model is None or tokenizer is None:
        raise RuntimeError('Model/tokenizer not loaded. Call load_model() first.')

    # UI sliders may deliver floats; generation and slicing need ints.
    max_length = int(max_length)
    min_length = max(0, min(int(min_length), max_length))
    num_return_sequences = max(1, int(num_return_sequences))
    beams = max(1, int(num_beams)) if num_beams else 1
    sampling = temperature is not None and float(temperature) != 1.0
    if beams > 1 or not sampling:
        # Beam/greedy decoding requires num_return_sequences <= num_beams.
        beams = max(beams, num_return_sequences)

    gen_kwargs = dict(max_length=max_length, min_length=min_length, num_return_sequences=num_return_sequences)
    if beams > 1:
        gen_kwargs['num_beams'] = beams
    if sampling:
        gen_kwargs['do_sample'] = True
        gen_kwargs['temperature'] = float(temperature)

    results = []
    for text in texts:
        if not text or not text.strip():
            results.append([''])
            continue
        inputs = tokenizer.encode_plus(text, return_tensors='pt', truncation=True)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs.get('attention_mask', None)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)
        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, **gen_kwargs)
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # A single input yields num_return_sequences outputs; keep them as one group.
        results.append(decoded[:num_return_sequences])
    return results


In [None]:
# Gradio UI: build blocks and the summarization generator that streams progress and auto-downloads final file
import gradio as gr
import base64
from pathlib import Path
import os

def _make_data_url(path: str) -> tuple[str,str]:
    """Encode the file at *path* as a base64 ``data:`` URL.

    Returns a ``(data_url, filename)`` pair, where *filename* is the final
    component of *path*.
    """
    source = Path(path)
    encoded = base64.b64encode(source.read_bytes()).decode('ascii')
    return f"data:application/octet-stream;base64,{encoded}", source.name

def _upload_display_name(f) -> str:
    """Return a short label (final path component) for an uploaded file object or path."""
    raw = getattr(f, 'name', None)
    if not raw and isinstance(f, (str, os.PathLike)):
        raw = f
    if not raw:
        return 'uploaded'
    return Path(str(raw)).name or 'uploaded'


def _auto_download_html(path: str) -> str:
    """Return an HTML snippet linking *path* as a base64 data URL, or '' if the file cannot be read."""
    try:
        data_url, filename = _make_data_url(path)
    except OSError:
        return ''
    return f'<a id="dl" href="{data_url}" download="{filename}">Download</a><script>document.getElementById("dl").click();</script>'


def summarize_interface(uploaded_files, raw_text, selected_indices, max_length, min_length, num_return_sequences, temperature, num_beams, use_fp16):
    """Generator: yields (display_text, download_path|None, html_auto_download) to stream progress and trigger client download.

    Inputs are the uploaded files (in order) followed by the pasted text, if
    any. ``selected_indices`` (e.g. ``'0,2'``) optionally restricts processing
    to those positions; out-of-range positions are ignored and an unparsable
    value falls back to all inputs with a note in the display.

    One tuple is yielded after each input is processed (download fields empty),
    then a final tuple carrying the accumulated summaries, the path of the
    result file (``summary.txt`` for one input, ``summaries.zip`` otherwise,
    written to a fresh temporary directory) and the auto-download HTML.

    Files that cannot be read are reported in the output and are not sent to
    the model.

    NOTE(review): whether the <script> in the HTML snippet executes depends on
    how the HTML component injects markup — confirm; the file output is the
    dependable download path.
    """
    import tempfile

    # (name, cleaned_text, read_error) for every candidate input.
    entries = []
    for f in uploaded_files or []:
        name = _upload_display_name(f)
        try:
            entries.append((name, clean_text(load_text_from_uploaded(f)), None))
        except Exception as e:
            entries.append((name, '', f'<<ERROR reading file: {e}>>'))
    if raw_text and raw_text.strip():
        entries.append(('pasted_text', clean_text(raw_text), None))

    if not entries:
        yield ('No inputs provided. Upload files or paste text.', None, '')
        return

    # Optionally filter by indices like '0,2'
    notice = ''
    if selected_indices and selected_indices.strip():
        try:
            idxs = [int(p) for p in (s.strip() for s in selected_indices.split(',')) if p]
        except ValueError:
            notice = 'Note: selected_indices could not be parsed; summarizing all inputs.\n\n'
        else:
            entries = [entries[i] for i in idxs if 0 <= i < len(entries)]
            if not entries:
                yield ('No inputs match selected_indices.', None, '')
                return

    # Only pay for model loading when at least one input is actually readable.
    if any(err is None for _, _, err in entries):
        try:
            load_model(use_fp16_on_cuda=use_fp16)
        except Exception as e:
            yield (f'Error loading model: {e}', None, '')
            return

    names = []
    summaries = []
    display_text = ''
    for name, text, read_error in entries:
        if read_error is not None:
            summary_text = read_error
        else:
            try:
                res = summarize_texts(
                    [text],
                    max_length=int(max_length),
                    min_length=int(min_length),
                    num_return_sequences=int(num_return_sequences),
                    temperature=temperature,
                    num_beams=int(num_beams),
                )
                summary_text = '\n\n'.join(res[0])
            except Exception as e:
                summary_text = f'<<Error during summarization: {e}>>'
        names.append(name)
        summaries.append(summary_text)
        display_text = notice + '\n\n'.join(f'--- {n} ---\n{s}' for n, s in zip(names, summaries))
        # intermediate yields: no download yet
        yield (display_text, None, '')

    # Final output file, in a per-run directory so concurrent runs cannot overwrite each other.
    out_dir = tempfile.mkdtemp(prefix='notes_summaries_')
    if len(summaries) == 1:
        out_path = os.path.join(out_dir, 'summary.txt')
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write(summaries[0])
    else:
        out_path = os.path.join(out_dir, 'summaries.zip')
        make_zip_from_texts(summaries, names, out_path)
    # Keep the summaries on screen; only append the completion marker.
    yield (f'{display_text}\n\nAll done', out_path, _auto_download_html(out_path))

# Build the Gradio Blocks layout and wire the Summarize button to summarize_interface.
with gr.Blocks() as demo:
    gr.Markdown('Upload notes (.txt, .md, .pdf) or paste text. Select files to summarize and press Summarize.')
    # Input sources: uploaded files and/or pasted raw text.
    with gr.Row():
        file_input = gr.File(file_count='multiple', label='Upload note files')
        text_input = gr.Textbox(lines=8, placeholder='Paste note text here', label='Raw text')
    # Generation length controls (forwarded to summarize_interface).
    with gr.Row():
        max_length = gr.Slider(16, 1024, value=128, step=8, label='max_length')
        min_length = gr.Slider(8, 512, value=30, step=1, label='min_length')
    # Decoding controls (forwarded to summarize_interface).
    with gr.Row():
        num_return_sequences = gr.Slider(1, 5, value=1, step=1, label='num_return_sequences')
        temperature = gr.Slider(0.1, 2.0, value=1.0, step=0.1, label='temperature')
        num_beams = gr.Slider(1, 8, value=4, step=1, label='num_beams')
    use_fp16 = gr.Checkbox(value=True, label='use_fp16_on_cuda (if GPU)')
    # Comma-separated positions into the combined input list (uploads first, then pasted text).
    selected_indices = gr.Textbox(lines=1, placeholder='e.g. 0,2 to pick first and third uploaded files (or leave empty)', label='selected_indices')
    summarize_btn = gr.Button('Summarize')
    # Three outputs, matching the 3-tuples yielded by summarize_interface.
    output = gr.Textbox(label='Summaries (display)')
    download_output = gr.File(label='Download results')
    html_download = gr.HTML(label='Download (auto)')

    # The order of `inputs` must match summarize_interface's positional parameters.
    summarize_btn.click(fn=summarize_interface, inputs=[file_input, text_input, selected_indices, max_length, min_length, num_return_sequences, temperature, num_beams, use_fp16], outputs=[output, download_output, html_download])



In [None]:
# Launch the Gradio app (execute in Colab after installing requirements)
try:
    demo.launch(share=False)
except Exception as e:
    print('Failed to launch Gradio app:', e)
    print('If running in Colab, ensure you have run the pip install cell and that the notebook has access to project files.')
