# AI News Summarizer – Demo Notebook

This notebook demonstrates how to:
- Use Hugging Face transformers to summarize text.
- Extract article text from a URL with `trafilatura`.
- Fall back to a classical extractive method (`sumy`) if needed.

Run the cells below in order: the import cell and the helper-function cell must be executed before either example. You can also adapt the helper functions for your own pipeline.

In [0]:
# Optional: install requirements in a fresh environment
# !pip install -r ../requirements.txt
import re, io
from typing import List

try:
    import trafilatura
except Exception:
    trafilatura = None

try:
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lex_rank import LexRankSummarizer
except Exception:
    PlaintextParser = Tokenizer = LexRankSummarizer = None

from transformers import pipeline
print('Libraries imported.')

In [0]:
def clean_text(text: str) -> str:
    """Normalize whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space and the result is trimmed at both ends. A falsy ``text``
    (``None`` or ``""``) is treated as the empty string.

    Args:
        text: Raw text to normalize.

    Returns:
        The whitespace-normalized string.
    """
    raw = text or ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()

def _split_oversized(segment: str, limit: int) -> List[str]:
    """Cut ``segment`` into pieces of at most ``limit`` characters, preferring to cut at a space."""
    pieces = []
    while len(segment) > limit:
        # Look for the last space that still leaves the piece within the limit.
        cut = segment.rfind(" ", 0, limit + 1)
        if cut <= 0:
            # No usable space (e.g. one very long token): hard cut at the limit.
            cut = limit
        pieces.append(segment[:cut].rstrip())
        segment = segment[cut:].lstrip()
    if segment:
        pieces.append(segment)
    return pieces


def chunk_text_for_model(text: str, max_chunk_chars: int = 2500) -> List[str]:
    """Split ``text`` into chunks of at most ``max_chunk_chars`` characters.

    The text is stripped first. If it already fits, it is returned as a single
    chunk. Otherwise it is split into sentences (after ``.``, ``!`` or ``?``
    followed by whitespace) and consecutive sentences are packed greedily,
    joined by a single space, while the chunk stays within the limit.

    A sentence that is itself longer than ``max_chunk_chars`` (including text
    with no sentence punctuation at all) is broken into smaller pieces,
    preferably at a space, so that no returned chunk exceeds the limit.

    Args:
        text: Text to split.
        max_chunk_chars: Maximum length of each chunk, in characters.

    Returns:
        The list of chunks, in original order.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1.
    """
    if max_chunk_chars < 1:
        raise ValueError("max_chunk_chars must be a positive integer")

    text = text.strip()
    if len(text) <= max_chunk_chars:
        return [text]

    chunks: List[str] = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        # A sentence within the limit comes back unchanged as a single piece.
        for piece in _split_oversized(sentence, max_chunk_chars):
            joined_len = len(current) + len(piece) + (1 if current else 0)
            if joined_len <= max_chunk_chars:
                current += (" " if current else "") + piece
            else:
                if current:
                    chunks.append(current)
                current = piece
    if current:
        chunks.append(current)
    return chunks

# Summarization pipelines already built in this kernel session, keyed by model name.
_SUMMARIZER_CACHE = {}


def load_summarizer(model_name: str = "sshleifer/distilbart-cnn-12-6"):
    """Return a Hugging Face ``summarization`` pipeline for ``model_name``.

    The pipeline is built on first request and then reused for later calls
    with the same ``model_name``, so repeated summaries do not reload the
    model each time.

    Args:
        model_name: Model identifier passed to ``transformers.pipeline``.

    Returns:
        The pipeline object created by ``pipeline("summarization", ...)``.
    """
    if model_name not in _SUMMARIZER_CACHE:
        # NOTE(review): device_map="auto" presumably needs the `accelerate`
        # package to be installed — confirm against the project's requirements.
        _SUMMARIZER_CACHE[model_name] = pipeline(
            "summarization", model=model_name, device_map="auto"
        )
    return _SUMMARIZER_CACHE[model_name]

def hf_summarize(text: str, model_name: str = "sshleifer/distilbart-cnn-12-6", max_words: int = 220) -> str:
    """Summarize ``text`` with a Hugging Face summarization pipeline.

    The text is split with ``chunk_text_for_model``; each chunk is summarized
    independently and the partial summaries are joined with a single space.
    The length budget is applied per chunk, so the combined summary of a
    multi-chunk text can be longer than ``max_words``.

    Args:
        text: Text to summarize. Empty or whitespace-only text yields ``""``
            without loading the model.
        model_name: Model identifier forwarded to ``load_summarizer``.
        max_words: Approximate upper bound, in words, for each chunk summary.
            It is converted to tokens with a 1.3 tokens-per-word heuristic.

    Returns:
        The concatenated chunk summaries, or ``""`` for empty input.
    """
    # Nothing to summarize: skip the (expensive) model load entirely.
    if not text or not text.strip():
        return ""

    # Floor of 2 keeps max_length > min_length possible even for tiny budgets.
    max_tokens = max(2, int(max_words * 1.3))

    summarizer = load_summarizer(model_name)
    outputs = []
    for ch in chunk_text_for_model(text):
        # Rough input size using the same words->tokens heuristic; asking for a
        # summary longer than its input forces the model to pad the output.
        approx_input_tokens = int(len(ch.split()) * 1.3)
        chunk_max = max(2, min(max_tokens, approx_input_tokens))
        # Same 40% rule (floor of 30) as before, but never >= max_length.
        chunk_min = min(max(30, int(chunk_max * 0.4)), chunk_max - 1)
        out = summarizer(ch, max_length=chunk_max, min_length=chunk_min, do_sample=False, truncation=True)
        outputs.append(out[0]["summary_text"])
    return " ".join(outputs)

def sumy_lexrank_summary(text: str, sentences: int = 5) -> str:
    """Extractive summary of ``text`` using sumy's LexRank summarizer.

    This is a best-effort helper: it returns ``""`` when any of the sumy
    names failed to import, and also when sumy raises any exception while
    parsing or summarizing.

    Args:
        text: Text to summarize; parsed with sumy's ``"english"`` tokenizer.
        sentences: Number of sentences requested from LexRank.

    Returns:
        The selected sentences joined by single spaces, or ``""`` on failure.
    """
    sumy_available = all((PlaintextParser, Tokenizer, LexRankSummarizer))
    if not sumy_available:
        return ""

    try:
        document = PlaintextParser.from_string(text, Tokenizer("english")).document
        ranked = LexRankSummarizer()(document, sentences)
        return " ".join(map(str, ranked))
    except Exception:
        # Deliberate: the fallback must never break the notebook run.
        return ""

# Progress marker: reaching this line means every helper definition in this cell executed.
print('Utility functions ready.')

In [0]:
# Example A: summarize a URL (if trafilatura is available)
# Flow: download the page, extract and clean the article text, then summarize
# it only when more than 100 characters were extracted.
url = "https://example.com/news"
if trafilatura is not None:
    try:
        downloaded = trafilatura.fetch_url(url)
        # `extract` may yield nothing; normalize that to an empty string.
        article = clean_text(trafilatura.extract(downloaded) or "")
        if len(article) <= 100:
            print('Extraction yielded very short content; try another URL.')
        else:
            print('Extracted chars:', len(article))
            print('\nHF Summary:\n', hf_summarize(article))
    except Exception as e:
        # Covers failures in download, extraction and summarization alike.
        print('Extraction error:', e)
else:
    print("trafilatura not installed; skipping URL extraction.")

In [0]:
# Example B: summarize pasted text
# The sample is whitespace-normalized once, summarized with the transformer
# model, and then passed through the LexRank helper for comparison.
sample_text = clean_text(
    "OpenAI's recent advancements in natural language processing have accelerated the adoption of AI across industries. "
    "From customer support automation to content generation, organizations are increasingly relying on transformer-based models. "
    "However, challenges remain, including responsible deployment, evaluation, and governance."
    " This notebook demonstrates a simple summarization pipeline using transformers with an extractive fallback."
)
print('Original chars:', len(sample_text))
print('\nHF Summary:\n', hf_summarize(sample_text))

fallback = sumy_lexrank_summary(sample_text)
if not fallback:
    # An empty result is also what the helper returns when sumy raised an error.
    print('\nLexRank not available in this environment.')
else:
    print('\nLexRank Fallback Summary:\n', fallback)