In [4]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Load the summarization pipeline globally
summarizer_pipeline = pipeline("summarization", model="t5-small", tokenizer="t5-small")

Device set to use cpu


In [5]:
def get_webpage_text(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style tags (they don't contain visible text)
        for tag in soup(['script', 'style', 'noscript']):
            tag.decompose()

        # Get all visible text
        text = soup.get_text(separator=' ', strip=True)
        return text
    except Exception as e:
        print(f"Error fetching or parsing the webpage: {e}")
        return ""

In [9]:
def summarize_text(text, max_chunk_len=500):
    # T5 has a token limit, so we split large text into smaller chunks
    chunks = [text[i:i+max_chunk_len] for i in range(0, len(text), max_chunk_len)]
    summaries = []

    for chunk in chunks:
        input_text = "summarize: " + chunk
        summary = summarizer_pipeline(input_text, do_sample=False)[0]['summary_text']
        summaries.append(summary)
    
    return ' '.join(summaries)

In [7]:
def summarize_webpage(url):
    print(f"Fetching content from: {url}")
    text = get_webpage_text(url)
    if not text:
        print("No content to summarize.")
        return
    
    print("\nGenerating summary...\n")
    summary = summarize_text(text)
    print("Summary:\n")
    print(summary)

In [10]:

test_url = "https://bytescout.com/blog/32-bit-single-precision-floating-point.html"
summarize_webpage(test_url)

Fetching content from: https://bytescout.com/blog/32-bit-single-precision-floating-point.html


Your max_length is set to 200, but your input_length is only 132. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=66)



Generating summary...



Your max_length is set to 200, but your input_length is only 147. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=73)
Your max_length is set to 200, but your input_length is only 158. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)
Your max_length is set to 200, but your input_length is only 142. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)
Your max_length is set to 200, but your input_length is only 133. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=66)


Summary:

ByteScout SDK products are sunsetting as we focus on our new & improved solutions . Introducing PDF.co . Try pdf.co for document processing via Web APIs Learn More Introducing MuPDF . free signup PDF Merger API PDF Splitter API HTML to PDF API PDF Editor API Document Parser PDF Converter API PDF Extractor API Barcode Generator API File Uploader API SDK – Extract PDF to Excel, CSV, JSON, Text, XML . PDF to HTML with layout preserved PDF Renderer SDK . convert PDF to PNG, JPG, TIFF, BMP, EMF formats . read 1D and 2D barcodes from image and PDF files . PDF Generator, PDF to HTML, PDF Generator for JS [3 in 1] Barcode Suite . Extract data from documents, PDF, images, Excel on your desktop or web applications . Sensitive Data Suite – Detect, Remove, Analyze Your Documents for Sensive Data and PII (self-hosted cloud) API Server – Sec. ure and scalable REST API server that you can install on-premises Customers Buy Pricing Request a Quote Contact Sales Customization Local Resellers L