# Web Scraping
*   HTML structure
*   using requests to fetch Web pages
*   using BeautifulSoup for parsing


In [6]:
!pip install requests
!pip install beautifulsoup4



In [7]:
# https://en.wikipedia.org/wiki/Python_(programming_language)

import requests

url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
response = requests.get(url)

if response.status_code == 200:
    print(response.text[:500])
else:
    print("Error")

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vect


In [10]:
from bs4 import BeautifulSoup

html_content = '<h1>Main title</h1><p>This is title</p><a href = "https://example.com">Click here</a>'
soup = BeautifulSoup(html_content, 'html.parser')

print(soup.h1.text)
print(soup.p.text)

Main title
This is title


# Project: Wikipedia Article Scraper

In [27]:
import requests
from bs4 import BeautifulSoup

# step 1: get wikipedia article URL
def get_wikipedia(topic):
  url = f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}"
  response = requests.get(url)

  if response.status_code == 200:
    return response.text
  else:
    print(f"failed to retrieve data. status code: {response.status_code}. check the topic and try again")
    return None

# step 2: extract article title
def get_article_title(soup):
  return soup.find('h1').text

# step 3: extract article summary
def get_article_summary(soup):
  paragraphs = soup.find_all('p')
  for para in paragraphs:
    if para.text.strip():
      return para.text.strip()
  return "No sumamry found"

# step 4: extract headings
def get_event_datetime(soup):
  headings = [heading.text.strip() for heading in soup.find_all(['h2', 'h3', 'h4'])]
  return headings

# step 5: extract related links
def get_related_links(soup):
  links = []
  for a_tag in soup.find_all('a', href = True):
    href = a_tag['href']
    if href.startswith('/wiki/') and ":" not in href:
      links.append(f"https://en.wikipedia.org{a_tag['href']}")
  return list(set(links))[:5]

# step 6: main program
def main():
  topic = input("Enter a topic: ").strip()
  html_content = get_wikipedia(topic)
  if html_content:
    soup = BeautifulSoup(html_content, 'html.parser')
    title = get_article_title(soup)
    summary = get_article_summary(soup)
    headings = get_event_datetime(soup)
    related_links = get_related_links(soup)

    print("\n---- Wikipedia Article Details ----")
    print(f"\nTitle: {title}")
    print(f"\nSummary: {summary}")
    print("\nHeadings:")
    for heading in headings[:5]:
      print(f"  - {heading}")

    print("\nRelated Links:")
    for link in related_links:
      print(link)

if __name__ == "__main__":
  main()


Enter a topic: Porsche

---- Wikipedia Article Details ----

Title: Porsche

Summary: Dr. Ing. h.c. F. Porsche AG, usually shortened to Porsche (German pronunciation: [ˈpɔʁʃə] ⓘ; see below), is a German automobile manufacturer specializing in luxury, high-performance sports cars, SUVs and sedans, headquartered in Stuttgart, Baden-Württemberg, Germany. The company is owned by Volkswagen AG, a controlling stake of which is owned by Porsche Automobil Holding SE, usually shortened to Porsche SE. Porsche's current lineup includes the 718, 911, Panamera, Macan, Cayenne and Taycan.

Headings:
  - Contents
  - History
  - Origin
  - Company logo
  - Developments

Related Links:
https://en.wikipedia.org/wiki/Bundesland_(Germany)
https://en.wikipedia.org/wiki/Quattro_(four-wheel-drive_system)
https://en.wikipedia.org/wiki/Volkswagen_Group_of_America
https://en.wikipedia.org/wiki/Ducati_Motor_Holding
https://en.wikipedia.org/wiki/VW_Electronics_Research_Laboratory
