In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [2]:
def get_html_content(url):
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def extract_title(soup):
    return soup.find("h1", id="firstHeading").text

def extract_article_text(soup):
    content = {}
    content_div = soup.find("div", class_="mw-parser-output")
    
    current_heading = "Introduction"
    paragraphs = []

    for tag in content_div.find_all(['h2', 'h3', 'p']):
        if tag.name in ['h2', 'h3']:
            if paragraphs:
                content[current_heading] = paragraphs
                paragraphs = []
            current_heading = tag.get_text(strip=True).replace("[edit]", "")
        elif tag.name == 'p':
            text = tag.get_text(strip=True)
            if text:
                paragraphs.append(text)

    if paragraphs:
        content[current_heading] = paragraphs

    return content

def extract_internal_links(soup):
    base_url = "https://en.wikipedia.org"
    links = set()
    
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith("/wiki/") and not any(prefix in href for prefix in [":", "#"]):
            full_url = urljoin(base_url, href)
            links.add(full_url)
    
    return list(links)

def extract_wikipedia_data(url):
    soup = get_html_content(url)
    title = extract_title(soup)
    text = extract_article_text(soup)
    links = extract_internal_links(soup)

    return {
        "title": title,
        "content_by_heading": text,
        "internal_links": links
    }

if __name__ == "__main__":
    test_url = "https://en.wikipedia.org/wiki/Web_scraping"
    data = extract_wikipedia_data(test_url)

    print("Title:", data["title"])
    print("\nHeadings and Paragraphs:")
    for heading, paragraphs in data["content_by_heading"].items():
        print(f"\n {heading}")
        for p in paragraphs[:2]:  # Show only first 2 paragraphs per heading for brevity
            print("-", p)

    print(f"\nTotal Internal Links Found: {len(data['internal_links'])}")
    print("Sample Links:", data["internal_links"][:5])

Title: Web scraping

Headings and Paragraphs:

 Introduction
- Web scraping,web harvesting, orweb data extractionisdata scrapingused forextracting datafromwebsites.[1]Web scraping software may directly access theWorld Wide Webusing theHypertext Transfer Protocolor a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using abotorweb crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central localdatabaseorspreadsheet, for laterretrievaloranalysis.
- Scraping a web page involves fetching it and then extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Having fetched, extraction can take place. The content of a page may beparsed, searched and reformatted, and its data copied into a spreadsheet o