<a href="https://colab.research.google.com/github/Ikwuegbu/Git-Checkpoint/blob/main/Python_Web_Scraping_Checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install requests beautifulsoup4



In [2]:
# Function to fetch and parse the HTML content of a Wikipedia page
import requests
from bs4 import BeautifulSoup

def get_html_content(wiki_url, timeout=10, headers=None):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        wiki_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.
        headers: Optional dict of HTTP request headers (for example a
            descriptive User-Agent). ``None`` sends the library defaults.

    Returns:
        A BeautifulSoup object built from the response body using the
        built-in 'html.parser'.

    Raises:
        Exception: If the server answers with any status code other than 200.
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection problems or when the timeout expires.
    """
    response = requests.get(wiki_url, timeout=timeout, headers=headers)

    # Guard clause: anything other than a plain 200 is treated as a failure.
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve content. Status code: {response.status_code}")

    return BeautifulSoup(response.content, 'html.parser')

In [3]:
#Function to extract article title
def extract_article_title(soup):
    """Return the article title taken from the first <h1> tag of the page.

    Args:
        soup: Parsed page; must provide ``find(tag_name)`` returning an
            element with a ``.text`` attribute, or ``None`` if absent.

    Returns:
        The <h1> text with surrounding whitespace removed, or ``None`` when
        the page contains no <h1> tag.
    """
    heading = soup.find('h1')

    # find() yields None when no <h1> exists; avoid an AttributeError on .text.
    if heading is None:
        return None

    return heading.text.strip()

In [4]:
# Function to extract the article's paragraph text, grouped under the heading each paragraph follows
def extract_text_with_headings(soup):
    """Group the page's paragraph text under the heading that precedes it.

    The <h2>, <h3> and <p> tags are visited in the order ``find_all``
    returns them. Each heading becomes the current section; every following
    paragraph is collected under it until the next heading appears.

    Args:
        soup: Parsed page; must provide ``find_all(list_of_tag_names)``
            yielding elements with ``.name`` and ``.text`` attributes.

    Returns:
        A dict mapping heading text to the section's paragraphs joined with
        newlines. Paragraphs that appear before any heading are stored under
        the key ``None``. Headings without any non-empty paragraph are omitted.
    """
    paragraphs_by_heading = {}
    current_heading = None

    for tag in soup.find_all(['h2', 'h3', 'p']):
        if tag.name == 'p':
            paragraph = tag.text.strip()
            # Skip empty <p> tags so they do not add blank lines to the text.
            if paragraph:
                paragraphs_by_heading.setdefault(current_heading, []).append(paragraph)
        else:
            # Remove the "[edit]" marker first, then strip, so no whitespace
            # that sat between the title and the marker is left behind.
            current_heading = tag.text.replace('[edit]', '').strip()

    # Collapse each section's list of paragraphs into a single string.
    return {
        heading: "\n".join(paragraphs)
        for heading, paragraphs in paragraphs_by_heading.items()
    }


In [None]:
# Function to collect every link pointing to another Wikipedia page
def collect_internal_links(soup, base_url="https://en.wikipedia.org"):
    """Collect absolute URLs for the page's internal '/wiki/' links.

    Args:
        soup: Parsed page; must provide ``find_all('a', href=True)`` yielding
            elements that support ``element['href']``.
        base_url: Scheme and host prepended to each relative link. Defaults
            to the English Wikipedia; pass another host for other language
            editions. A trailing slash is tolerated.

    Returns:
        A set of absolute URLs, one per distinct href that starts with
        '/wiki/' and does not start with '/wiki/Special:'.
    """
    # Drop a trailing slash so joining with '/wiki/...' never yields '//'.
    prefix = base_url.rstrip('/')

    return {
        prefix + anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('/wiki/')
        and not anchor['href'].startswith('/wiki/Special:')
    }

In [None]:
#Wrap all functions into a single function
def scrape_wikipedia_page(wiki_url):
    """Run the whole scraping pipeline for a single page.

    The page is downloaded and parsed once; the same parsed tree is then
    handed to each extraction helper.

    Args:
        wiki_url: URL of the page to scrape.

    Returns:
        A dict with three entries:
            'title'          -- result of extract_article_title
            'content'        -- result of extract_text_with_headings
            'internal_links' -- result of collect_internal_links
    """
    page = get_html_content(wiki_url)

    # Dict values are evaluated top to bottom, so the helpers run in the
    # order title -> content -> links.
    return {
        'title': extract_article_title(page),
        'content': extract_text_with_headings(page),
        'internal_links': collect_internal_links(page),
    }

In [None]:
#Test the last function on a wikipedia page of your choice.
# Example Wikipedia page to test the function
wiki_url = 'https://en.wikipedia.org/wiki/Web_scraping'

# Maximum number of characters of each section shown in the preview below
PREVIEW_CHARS = 500

# Test the function
result = scrape_wikipedia_page(wiki_url)

# Display the results
print("Title of the Article:", result['title'])
print("\nArticle Content (Headings and Paragraphs):")
for heading, text in result['content'].items():
    print(f"\n{heading}:")
    # Only append an ellipsis when the text was actually cut short
    preview = (text[:PREVIEW_CHARS] + '...') if len(text) > PREVIEW_CHARS else text
    print(preview)

print("\nNumber of Internal Links Found:", len(result['internal_links']))
print("First 5 Internal Links:")
# A set has no stable iteration order; sort so every run shows the same five links
for link in sorted(result['internal_links'])[:5]:
    print(link)