<a href="https://colab.research.google.com/github/KathituCodes/Wikipedia-Scraper/blob/main/Wikipedia_Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup

# Function to get and parse HTML content
def get_html_content(url, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``html.parser`` backend.

    Raises:
        Exception: If the server answers with any status code other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve content. Status code: {response.status_code}")
    return BeautifulSoup(response.content, 'html.parser')

# Function to extract article title
def extract_article_title(soup):
    """Return the article title taken from the ``<h1 id="firstHeading">`` tag.

    Args:
        soup: Parsed page (anything exposing a BeautifulSoup-style ``find``).

    Returns:
        The text content of the first-heading element.

    Raises:
        ValueError: If the page has no ``<h1 id="firstHeading">`` element,
            e.g. because the URL did not point at an article page.
    """
    heading = soup.find("h1", {"id": "firstHeading"})
    if heading is None:
        # find() returns None on a miss; fail with an explicit message instead
        # of an opaque "'NoneType' object has no attribute 'text'".
        raise ValueError('Article title not found: page has no <h1 id="firstHeading"> element.')
    return heading.text

# Function to extract article text with headings
def extract_article_text(soup):
    """Group the page's paragraphs under the heading they follow.

    For every ``<h2>``/``<h3>`` the function walks the following sibling
    elements and collects the text of each ``<p>`` until the next ``<h2>`` or
    ``<h3>`` is reached.

    Wikipedia may wrap a heading in ``<div class="mw-heading ...">``. In that
    layout the heading's own siblings are not the section body, so the walk
    starts from the wrapper instead and also stops at the next wrapper that
    holds an ``<h2>``/``<h3>``. Unwrapped headings are handled as before.

    Args:
        soup: Parsed page (BeautifulSoup tree).

    Returns:
        A dict mapping heading text to the list of paragraph strings found
        in that section, in document order. Sections without paragraphs map
        to an empty list. If the same heading text occurs more than once, the
        paragraphs of all occurrences are accumulated under the one key.
    """
    heading_tags = ['h2', 'h3']

    def is_heading_wrapper(tag):
        # True for a <div> whose class list contains "mw-heading".
        if tag is None or tag.name != 'div':
            return False
        return 'mw-heading' in (tag.get('class') or [])

    content = {}
    for heading in soup.find_all(heading_tags):
        heading_text = heading.get_text(strip=True)
        # setdefault keeps paragraphs already collected for a repeated title.
        paragraphs = content.setdefault(heading_text, [])

        # The section body sits next to the wrapper when there is one.
        anchor = heading.parent if is_heading_wrapper(heading.parent) else heading

        for sibling in anchor.find_next_siblings():
            if sibling.name in heading_tags:
                break
            if is_heading_wrapper(sibling) and sibling.find(heading_tags, recursive=False) is not None:
                break
            if sibling.name == 'p':
                paragraphs.append(sibling.get_text(strip=True))
    return content

# Function to collect links redirecting to Wikipedia pages
def collect_wikipedia_links(soup, base_url="https://en.wikipedia.org"):
    """Collect absolute URLs of the article pages linked from the page.

    Only site-relative links that start with ``/wiki/`` and contain no ``:``
    are kept; the ``:`` test filters out namespaced targets such as
    ``/wiki/File:...`` or ``/wiki/Help:...``.

    Args:
        soup: Parsed page (BeautifulSoup tree).
        base_url: Scheme and host prepended to each relative link.

    Returns:
        A set of absolute URLs. Any ``#fragment`` is removed first, so several
        links to different sections of one article yield a single entry.
    """
    links = set()
    for anchor in soup.find_all('a', href=True):
        # Strip the in-page anchor: "/wiki/X#History" and "/wiki/X" are one page.
        path = anchor['href'].partition('#')[0]
        if path.startswith('/wiki/') and ':' not in path:
            links.add(f"{base_url}{path}")
    return links

# Function to scrape Wikipedia page and produce detailed output
def scrape_wikipedia_page(url):
    """Scrape one Wikipedia article, print a short report and return it.

    Fetches the page, then uses ``extract_article_title``,
    ``extract_article_text`` and ``collect_wikipedia_links`` to build the
    summary.

    Args:
        url: Address of the Wikipedia article.

    Returns:
        A dict with the keys ``title``, ``num_sections``,
        ``first_3_sections``, ``first_paragraph`` (``None`` when the first
        section has no paragraphs or there are no sections), ``num_links``
        and ``first_5_links``.

    Raises:
        Exception: Propagated from ``get_html_content`` when the page cannot
            be retrieved.
    """
    soup = get_html_content(url)

    title = extract_article_title(soup)
    content = extract_article_text(soup)
    links = collect_wikipedia_links(soup)

    # Section headings in the order they were collected
    sections = list(content)
    num_sections = len(sections)
    first_3_sections = sections[:3]

    # First paragraph of the first section
    first_section = sections[0] if sections else None
    first_paragraph = content[first_section][0] if first_section and content[first_section] else None

    num_links = len(links)

    # `links` is a set, so slicing it directly would give a sample that can
    # differ from run to run; sort first to make the preview reproducible.
    first_5_links = sorted(links)[:5]

    # Print detailed output
    print(f"Title: {title}")
    print(f"Number of sections: {num_sections}")
    print(f"First 3 sections: {first_3_sections}")
    print(f"First paragraph of {first_section}:")
    if first_paragraph:
        print(first_paragraph)
    else:
        print("No paragraphs found.")
    print(f"Number of Wikipedia links: {num_links}")
    print("First 5 Wikipedia links:")
    for link in first_5_links:
        print(link)

    return {
        "title": title,
        "num_sections": num_sections,
        "first_3_sections": first_3_sections,
        "first_paragraph": first_paragraph,
        "num_links": num_links,
        "first_5_links": first_5_links,
    }

# Example usage
if __name__ == "__main__":
    # Demo run: scrape a single, hard-coded article. This performs a live
    # HTTP request, so it needs network access.
    wikipedia_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"  # Example page
    data = scrape_wikipedia_page(wikipedia_url)
    # scrape_wikipedia_page has already printed a readable report; this line
    # additionally shows the raw dict it returned.
    print(data)


Title: Python (programming language)
Number of sections: 30
First 3 sections: ['Contents', 'History', 'Design philosophy and features']
First paragraph of Contents:
No paragraphs found.
Number of Wikipedia links: 843
First 5 Wikipedia links:
https://en.wikipedia.org/wiki/Autoregressive_model
https://en.wikipedia.org/wiki/Speakeasy_(computational_environment)
https://en.wikipedia.org/wiki/GPT-J
https://en.wikipedia.org/wiki/Inductive_bias
https://en.wikipedia.org/wiki/Open-source_software_security
{'title': 'Python (programming language)', 'num_sections': 30, 'first_3_sections': ['Contents', 'History', 'Design philosophy and features'], 'first_paragraph': None, 'num_links': 843, 'first_5_links': ['https://en.wikipedia.org/wiki/Autoregressive_model', 'https://en.wikipedia.org/wiki/Speakeasy_(computational_environment)', 'https://en.wikipedia.org/wiki/GPT-J', 'https://en.wikipedia.org/wiki/Inductive_bias', 'https://en.wikipedia.org/wiki/Open-source_software_security']}
