In [1]:
import requests
from bs4 import BeautifulSoup
import csv

# Load the URLs to scrape: one URL per line, blank lines skipped.
# The encoding is stated explicitly so the read does not depend on the
# platform's locale default.
with open("./urls_en.txt", "r", encoding="utf-8") as f:
    urls = [line.strip() for line in f if line.strip()]

# Path of the CSV file the scraped rows are written to
output_file = "./scraped_data.csv"

In [2]:
# Extract the title, date and main text of a single page
def scrape_page(url):
    """Fetch ``url`` and return ``[title, date, url, content]`` as one CSV row.

    Every field falls back to the placeholder ``"N/A"`` when the page offers
    no usable (non-empty) value for it:

    * title   -- stripped text of the first ``<title>`` tag.
    * date    -- stripped text of the first ``<time>`` tag; when that is
                 missing or empty, the ``content`` attribute of
                 ``<meta name="date">``.
    * content -- the non-empty, stripped ``<p>`` texts joined by single
                 spaces, taken from the first match among
                 ``div.entry-content``, ``<article>`` and ``<body>``.

    Nothing is raised to the caller: any exception (network failure, HTTP
    error status, parsing problem) is printed and reported as the row
    ``["Erreur", "N/A", url, "N/A"]``.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Title: look the tag up once; an empty title counts as missing.
        title_tag = soup.find("title")
        title = (title_tag.text.strip() if title_tag is not None else "") or "N/A"

        # Date: prefer the visible <time> text, then <meta name="date">.
        date = "N/A"
        time_tag = soup.find("time")
        time_text = time_tag.text.strip() if time_tag is not None else ""
        if time_text:
            date = time_text
        else:
            meta_tag = soup.find("meta", {"name": "date"})
            if meta_tag is not None:
                # .get() rather than ["content"]: a meta tag without a
                # content attribute must not raise KeyError and throw away
                # the rest of an otherwise valid page.
                date = meta_tag.get("content") or "N/A"

        # Main content: the first container that exists, most specific first.
        content = "N/A"
        main_content = (
            soup.find("div", {"class": "entry-content"})
            or soup.find("article")
            or soup.find("body")
        )
        if main_content is not None:
            paragraphs = (p.text.strip() for p in main_content.find_all("p"))
            # Keep the "N/A" placeholder when no paragraph carries any text.
            content = " ".join(text for text in paragraphs if text) or "N/A"

        return [title, date, url, content]

    except Exception as e:
        # Deliberate best-effort: one failing page must not abort the whole
        # run, so the error is reported and a placeholder row is returned.
        print(f"Erreur lors du scraping de {url}: {e}")
        return ["Erreur", "N/A", url, "N/A"]

In [3]:
# Scrape the URLs in order; `data` holds the CSV header row followed by
# one row per page (scrape_page returns a placeholder row on failure).
data = [["Title", "Date", "URL", "Content"]]
for url in urls:
    print(f"Scraping: {url}")
    row = scrape_page(url)
    data.append(row)

# Write the header and every scraped row to the CSV file in a single pass.
# newline="" leaves line-ending handling to the csv module.
with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    csv.writer(csv_file).writerows(data)

print(f"Données enregistrées dans {output_file}")

Scraping: https://www.state.gov/designation-of-international-cartels/
Scraping: https://www.state.gov/sanctioning-entities-that-have-traded-in-irans-petroleum/
Scraping: https://www.state.gov/sanctioning-service-providers-that-facilitate-irans-crude-oil-trade/
Scraping: https://www.state.gov/on-the-presidents-action-to-protect-americans-from-dangerous-foreign-gang-members/
Scraping: https://www.state.gov/congratulations-to-the-belizean-people-on-their-democratic-election/
Scraping: https://www.state.gov/announcement-of-a-visa-restriction-policy-to-address-the-forced-return-of-uyghurs-and-members-of-other-ethnic-or-religious-groups-with-protection-concerns-to-china/
Scraping: https://www.state.gov/priorities-and-mission-of-the-second-trump-administrations-department-of-state/
Scraping: https://www.state.gov/united-states-renewed-membership-in-the-geneva-consensus-declaration-on-promoting-womens-health-and-strengthening-the-family/
Scraping: https://www.state.gov/statement-of-the-g7-fore