In [None]:
# Web Scraping : data collection, market research, and more
# Parse HTML with BeautifulSoup
# Understanding the structure of HTML
# H1, p, ul, li, a (hyperlink), div, span

In [1]:
pip install beautifulsoup4 requests

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
   ---------------------------------------- 0.0/187.3 kB ? eta -:--:--
   --------------------------------------- 187.3/187.3 kB 11.8 MB/s eta 0:00:00
Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.4 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import requests

url = "https://fr.wikipedia.org/wiki/Python_(langage)"
response = requests.get(url)

if response.status_code == 200:
    html_content = response.text
    print(html_content[:500])  # Print the first 500 characters of the HTML content
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vect


In [3]:
from bs4 import BeautifulSoup

html_content = "<h1>Main Title</h1><p>This is a sample paragraph</p><a href='https://www.example.com'>Click here</a>"
soup = BeautifulSoup(html_content, 'html.parser')

print(soup.h1.text)  # Extract text from the first <h1> tag
print(soup.p.text)   # Extract text from the first <p> tag


Main Title
This is a sample paragraph


In [8]:
# Wikipedia Article Scraper
import requests
from bs4 import BeautifulSoup
import json

TOPIC_SEARCH_HISTORY = "./assets/search_history.json"

# Step 1: Get Wikipedia Article URL
def get_wikipedia_article(topic):
    url = f"https://fr.wikipedia.org/wiki/{topic.replace(' ', '_')}"
    print(f"Accessing to url : {url}...")
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}. Check the topic and try again.")
    return None

# Step 2: Extract Article Title
def get_article_title(soup):
    return soup.find('h1').text

# Step 3 : Extract Article Summary
def get_article_summary(soup):
    paragraphs = soup.find_all('p')
    for p in paragraphs:
        if p.text.strip():
            return p.text.strip()
    return "No summary found."

# Step 4: Extract Headings
def get_headings(soup):
    headings = [heading.text.strip() for heading in soup.find_all(['h2', 'h3', 'h4'])]
    return headings

# Step 5: Extract related links
def get_related_links(soup):
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith('/wiki/') and not ":" not in href:  # Exclude links to other namespaces like talk pages
            links.append(f"https://fr.wikipedia.org{href}")
    return list(set(links))[:5]

# Bonus:
def save_topic_file(topic, content):
    try:
        with open(f"./assets/{topic}.txt", "w") as file:
            file.write(content)
        print(f"File successfully created : './assets/{topic}.txt'")
    except Exception as e:
        print(f"An error occurred while saving the file: {e}")
        
def summary_json(title, summary, headings, related_links):
    return {
        "title": title,
        "summary": summary,
        "headings": headings,
        "related_links": related_links
    }
    
def load_json_history():
    try:
        with open(TOPIC_SEARCH_HISTORY, 'r') as json_file:
            json_content = json.load(json_file)
        if json_content:
            return json_content
    except FileNotFoundError:
        print("Search History File not found.")
    except Exception as e:
        print(f"An error occurred while loading the JSON file: {e}")
    return []
    
def save_json_history(topics_data):
    json_content = load_json_history()
    data = {
        "session" : len(json_content) + 1,
        "details" : topics_data
    }
    json_content.append(data)
    try:
        with open(TOPIC_SEARCH_HISTORY, 'w') as json_file:
            json.dump(json_content, json_file, indent=2)
            print("Search History File successfully updated.")
    except Exception as e:
        print(f"An error occured : {e}")

# Step 6: Main Program
def main():
    topics = input("Enter the topics to search on Wikipedia: (separated by '|')").strip().split('|')
    history = []
    for topic in topics:
        page_content = get_wikipedia_article(topic)
        content = f"\n------- {topic} Wikipedia Article Summary -------"
        if page_content:
            soup = BeautifulSoup(page_content, 'html.parser')
            title = get_article_title(soup)
            summary = get_article_summary(soup)
            headings = get_headings(soup)
            related_links = get_related_links(soup)
            content += f"\nTitle: {title}"
            content += f"\nSummary: {summary}"
            content += "\nHeadings:"
            for heading in headings:
                content += f"\n- {heading}"
            content += "\nRelated Links:"
            for link in related_links:
                content += f"\n- {link}"
            print(content)
            save_proposition = input("Do you want to save this topic in a file (yes/no) ?").strip()
            if save_proposition.lower() == 'yes':
                save_topic_file(topic, content)
            history.append(summary_json(title, summary, headings, related_links))
    save_json_history(history)

# Run Program      
if __name__ == "__main__":
    main()

Accessing to url : https://fr.wikipedia.org/wiki/Corinthe...

------- Corinthe Wikipedia Article Summary -------
Title: Corinthe
Summary: Pour les articles homonymes, voir Corinthien.
Headings:
- Sommaire
- Géographie
- Localisation
- Climat
- Voies de communication et transports
- Toponymie
- Histoire
- Antiquité
- Période grecque
- Période romaine
- Moyen Âge
- Empire byzantin
- Principauté latine d'Achaïe
- Despotat de Morée
- Les Hospitaliers
- Empire ottoman
- Époque moderne
- Royaume de Grèce
- Politique et administration
- Démographie
- Économie
- Agriculture
- Tourisme
- Culture locale et patrimoine
- Lieux et monuments
- Personnalités liées à la ville
- Jumelages
- Notes et références
- Notes
- Bibliographie
- Annexes
- Articles connexes
- Liens externes
Related Links:
- https://fr.wikipedia.org/wiki/Mod%C3%A8le:Sources_%C3%A0_lier/Explication
- https://fr.wikipedia.org/wiki/Sp%C3%A9cial:Pages_sp%C3%A9ciales
- https://fr.wikipedia.org/wiki/Sp%C3%A9cial:Mes_discussions
- https: