С сайта Breitbart

In [25]:
import csv
import time
from datetime import datetime, timezone
from urllib.parse import urljoin, urlsplit
import re
import json
import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtp
from pathlib import Path

In [26]:
# First listing page of the politics section; later pages append page/<n>/.
BASE_URL = "https://www.breitbart.com/politics/"

# Sent with every request so the crawler identifies itself.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://example.com/bot)",
}

# The listing crawl stops at the first article published before this instant.
CUTOFF = datetime(year=2025, month=6, day=1, tzinfo=timezone.utc)


def fetch_page(page: int) -> BeautifulSoup:
    """Download one listing page of the section and return it parsed.

    Page 1 is served from ``BASE_URL`` itself; every other page uses the
    ``page/<n>/`` suffix.  An HTTP error status raises through
    ``raise_for_status``.
    """
    if page == 1:
        listing_url = BASE_URL
    else:
        listing_url = f"{BASE_URL}page/{page}/"

    response = requests.get(listing_url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    html = response.text
    return BeautifulSoup(html, "html.parser")


def parse_article(card):
    """Extract title, URL, publication date and author from a listing card.

    Args:
        card: BeautifulSoup element holding one article card.

    Returns:
        A dict with the keys ``title``, ``url``, ``pub_date`` (ISO 8601
        string that always carries a UTC offset) and ``author``, or ``None``
        when the card has no linked ``<h2>`` headline or no parseable
        ``<time datetime=...>`` value.
    """
    h2 = card.find("h2")
    if not h2 or not h2.a:
        return None
    href = h2.a.get("href")
    if not href:
        # An anchor without a target cannot be followed later on.
        return None
    # get_text(strip=True) joins the stripped fragments with no separator and
    # glues words around inline tags; collapse whitespace on the raw text.
    title = " ".join(h2.get_text().split())
    url = urljoin(BASE_URL, href)

    time_tag = card.find("time")
    if not time_tag or not time_tag.has_attr("datetime"):
        return None
    try:
        pub_dt = dtp.isoparse(time_tag["datetime"])
    except ValueError:
        # A malformed timestamp should skip this card, not abort the crawl.
        return None
    if pub_dt.tzinfo is None:
        # CUTOFF is timezone-aware, so a naive value would raise on
        # comparison; naive timestamps are assumed to be UTC.
        pub_dt = pub_dt.replace(tzinfo=timezone.utc)

    author_tag = card.find("address")
    author = " ".join(author_tag.get_text().split()) if author_tag else ""

    return {
        "title": title,
        "url": url,
        "pub_date": pub_dt.isoformat(),
        "author": author,
    }


def crawl_pages(start_page=1, end_page=5, delay=1.5):
    """Collect article metadata from a range of listing pages.

    Pages that fail to load are reported and skipped.  The crawl ends early,
    returning everything gathered so far, at the first article published
    before ``CUTOFF``.

    Args:
        start_page: First listing page to fetch (inclusive).
        end_page: Last listing page to fetch (inclusive).
        delay: Seconds to pause after each listing request.

    Returns:
        A list of dicts as produced by ``parse_article``.
    """
    results = []

    for page in range(start_page, end_page + 1):
        try:
            soup = fetch_page(page)
        except Exception as e:
            print(f"[!] Failed to load page {page}: {e}")
            # Pause here too, so a struggling server is not hit back-to-back.
            time.sleep(delay)
            continue

        page_data = []
        page_dates = []

        for art in soup.select("section.aList article"):
            data = parse_article(art)
            if not data:
                continue

            pub_dt = dtp.isoparse(data["pub_date"])
            if pub_dt.tzinfo is None:
                # Comparing a naive value with the aware CUTOFF would raise
                # TypeError; naive timestamps are assumed to be UTC.
                pub_dt = pub_dt.replace(tzinfo=timezone.utc)
            if pub_dt < CUTOFF:
                print(f"⚠️  Reached cutoff date on page {page} — stopping.")
                return results + page_data

            page_data.append(data)
            page_dates.append(pub_dt)

        if page_data:
            # Oldest date on the page, reported as a progress indicator.
            last_date = min(page_dates)
            print(f"✅ Page {page}: {len(page_data)} articles, last date: {last_date.date()}")
        else:
            print(f"⚠️  Page {page}: no valid articles found.")

        results += page_data
        time.sleep(delay)

    return results


def save_csv(rows, path="breitbart_politics_2025.csv"):
    """Append article rows to a CSV file, emitting the header only once.

    The file is opened in append mode, and the header row is written only
    when the stream position reported right after opening is 0.
    """
    columns = ["title", "url", "pub_date", "author"]
    with open(path, mode="a", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        needs_header = handle.tell() == 0
        if needs_header:
            out.writeheader()
        for record in rows:
            out.writerow(record)

In [None]:
# Window of listing pages fetched by this run (both bounds inclusive).
START_PAGE = 26
END_PAGE = 30

# Crawl the window and append the rows to the default CSV of save_csv.
data = crawl_pages(START_PAGE, END_PAGE)
save_csv(data)
print(f"\nDone! Saved {len(data)} articles from pages {START_PAGE}–{END_PAGE}.")

In [None]:
# CSV read by enrich_with_text; must contain a "url" column.
INPUT_FILE = "breitbart_politics_2025.csv"
# Destination CSV: the input columns plus a "text" column.
OUTPUT_FILE = "breitbart.csv"

# Sent with every request so the crawler identifies itself.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://example.com/bot)"}

def extract_article_text(url):
    """Download an article page and return its body text.

    Args:
        url: Absolute URL of the article.

    Returns:
        The text of every ``<p>`` inside ``div.entry-content``, one paragraph
        per line with whitespace collapsed, or ``""`` when the request fails
        or the container is missing.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        content_div = soup.find("div", class_="entry-content")
        if not content_div:
            return ""

        # get_text(strip=True) strips every text node separately and joins
        # them with no separator, which glues words around inline tags such
        # as <a> or <em>.  Take the raw text and collapse whitespace instead.
        paragraphs = (
            " ".join(p.get_text().split()) for p in content_div.find_all("p")
        )
        return "\n".join(paragraphs)
    except Exception as e:
        # Best effort: one unreachable article must not abort the whole run.
        print(f"[!] Failed to fetch {url}: {e}")
        return ""

def enrich_with_text(input_path, output_path):
    """Add a ``text`` column holding the article body to every CSV row.

    Each row's ``url`` is fetched with ``extract_article_text``; the output
    file is written once, after all rows have been processed.

    Args:
        input_path: CSV file with a header row and a ``url`` column.
        output_path: Destination CSV; overwritten if it already exists.
    """
    with open(input_path, newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        rows = list(reader)
        # Take the columns from the header rather than from the first row, so
        # an input without data rows still yields a valid header-only file.
        fieldnames = list(reader.fieldnames or [])

    if "text" not in fieldnames:
        fieldnames.append("text")

    total = len(rows)
    for i, row in enumerate(rows, 1):
        url = row["url"]
        print(f"[{i}/{total}] Fetching: {url}")
        row["text"] = extract_article_text(url)
        if i < total:
            # Pause between requests only; nothing follows the last one.
            time.sleep(1.5)

    with open(output_path, "w", newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"\nSaved {total} articles to {output_path}.")


# Fetch the body of every article listed in INPUT_FILE and write OUTPUT_FILE.
enrich_with_text(INPUT_FILE, OUTPUT_FILE)

С сайта Mother Jones

In [None]:
# First listing page of the politics section; later pages append page/<n>/.
BASE_URL = "https://www.motherjones.com/politics/"

# The crawl ends at the first article URL containing this year/month segment.
STOP_PATTERN = re.compile(r"/2025/04/")

# Sent with every request so the crawler identifies itself.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://example.com/bot)",
}

OUT_PATH = "motherjones_politics_2025.csv"
REQUEST_DELAY = 1.5  # seconds to sleep after each listing page
MAX_PAGES = 30  # number of the last listing page to visit


def fetch_listing(page: int) -> BeautifulSoup:
    """Download one listing page of the section and return it parsed.

    Page 1 is ``BASE_URL`` itself; other pages use the ``page/<n>/`` suffix.
    An HTTP error status raises through ``raise_for_status``.
    """
    listing_url = f"{BASE_URL}page/{page}/" if page != 1 else BASE_URL
    response = requests.get(listing_url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "html.parser")

def extract_article_text(url):
    """Download an article page and return its body text.

    The container is the first match among ``div.article-body``,
    ``div.element-article-body`` and ``<article>``.

    Args:
        url: Absolute URL of the article.

    Returns:
        The text of every ``<p>`` in the container, one paragraph per line
        with whitespace collapsed, or ``""`` when the request fails or no
        container is found.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        content_div = (
            soup.find("div", class_="article-body")
            or soup.find("div", class_="element-article-body")
            or soup.find("article")
        )
        if not content_div:
            return ""

        # get_text(strip=True) strips every text node separately and joins
        # them with no separator, which glues words around inline tags such
        # as <a> or <em>.  Take the raw text and collapse whitespace instead.
        paragraphs = (
            " ".join(p.get_text().split()) for p in content_div.find_all("p")
        )
        return "\n".join(paragraphs)

    except Exception as e:
        # Best effort: one unreachable article must not abort the whole run.
        print(f"Failed to fetch article {url}: {e}")
        return ""

def crawl(start_page: int = 1, max_pages: int = MAX_PAGES) -> list[dict]:
    """Walk the listing pages and download every article found on them.

    The crawl ends when a listing page fails to load, a page has no article
    cards, an article URL matches ``STOP_PATTERN``, or ``max_pages`` has been
    processed.

    Args:
        start_page: First listing page to fetch (inclusive).
        max_pages: Last listing page to fetch (inclusive).

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``text``.
    """
    results: list[dict] = []

    for page in range(start_page, max_pages + 1):
        try:
            soup = fetch_listing(page)
        except Exception as e:
            print(f"Failed to load page {page}: {e}")
            break

        cards = soup.select("li.article-item, li.top-article-item")
        if not cards:
            print(f"Page {page}: no article cards found.")
            break

        stop = False
        for card in cards:
            h = card.find(["h2", "h3"])
            if not h or not h.a:
                continue
            href = h.a.get("href")
            if not href:
                # An anchor without a target cannot be followed.
                continue

            url = urljoin(BASE_URL, href)
            if STOP_PATTERN.search(url):  # reached April 2025
                stop = True
                print("Stop pattern reached — ending crawl.")
                break

            # get_text(strip=True) would glue words around inline tags.
            title = " ".join(h.get_text().split())
            text = extract_article_text(url)

            results.append({"title": title, "url": url, "text": text})
            # Article pages are requests too; pace them like listing pages.
            time.sleep(REQUEST_DELAY)

        print(f"Page {page}: collected {len(results)} articles")
        if stop:
            break
        time.sleep(REQUEST_DELAY)

    return results


def save_csv(rows, path=OUT_PATH):
    """Write the crawled rows to a fresh CSV file at ``path``.

    When ``rows`` is empty nothing is written and a notice is printed.
    """
    if rows:
        with open(path, "w", newline="", encoding="utf-8") as handle:
            out = csv.DictWriter(handle, fieldnames=["title", "url", "text"])
            out.writeheader()
            out.writerows(rows)
        print(f"\nSaved {len(rows)} articles to {path}.")
    else:
        print("Nothing to save.")

# Run the crawl from the first listing page and write the rows to OUT_PATH.
data = crawl(start_page=1, max_pages=MAX_PAGES)
save_csv(data, OUT_PATH)

Собираем статьи Common Dreams

In [None]:
# First listing page of the politics section; later pages append ?page=<n>.
BASE_URL = "https://www.commondreams.org/politics"

# The crawl ends at the first article URL containing this year/month segment.
STOP_PATTERN = re.compile(r"/2025/04/")

# Sent with every request so the crawler identifies itself.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://example.com/bot)",
}

OUT_PATH = "commondreams_politics_2025.csv"
REQUEST_DELAY = 1.5  # seconds to sleep after each listing page
MAX_PAGES = 40  # number of listing pages to visit, counted from page 0


def fetch_listing(page: int) -> BeautifulSoup:
    """Download one listing page of the section and return it parsed.

    Page 0 is ``BASE_URL`` itself; other pages add a ``?page=<n>`` query.
    An HTTP error status raises through ``raise_for_status``.
    """
    if page == 0:
        listing_url = BASE_URL
    else:
        listing_url = f"{BASE_URL}?page={page}"

    response = requests.get(listing_url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def extract_article_text(url):
    """Return the article body published in the page's JSON-LD metadata.

    Every ``<script type="application/ld+json">`` block is inspected for a
    ``NewsArticle`` node with a non-empty ``articleBody``.  The node may be
    the document itself, an element of a top-level array, or an element of
    the document's ``@graph`` list.

    Args:
        url: Absolute URL of the article.

    Returns:
        The stripped ``articleBody`` string, or ``""`` when the request fails
        or no such node is found.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string)
            except (json.JSONDecodeError, TypeError):
                # Empty (script.string is None) or malformed block: skip it.
                continue

            if isinstance(data, dict):
                candidates = [data]
                # The nodes may be wrapped in a graph.
                graph = data.get("@graph")
                if isinstance(graph, list):
                    candidates.extend(graph)
            elif isinstance(data, list):
                candidates = data
            else:
                continue

            for item in candidates:
                if not isinstance(item, dict):
                    continue
                types = item.get("@type")
                # "@type" may be a single string or a list of type names.
                is_news = types == "NewsArticle" or (
                    isinstance(types, list) and "NewsArticle" in types
                )
                body = item.get("articleBody")
                if is_news and isinstance(body, str) and body.strip():
                    return body.strip()

        print(f"Не найден articleBody в {url}")
        return ""

    except Exception as e:
        # Best effort: one unreachable article must not abort the whole run.
        print(f"Ошибка при загрузке текста: {e}")
        return ""

def crawl(max_pages=MAX_PAGES) -> list[dict]:
    """Walk the listing pages and download every article found on them.

    Pages are numbered from 0.  The crawl ends when a listing page fails to
    load, a page has no ``<article>`` cards, an article URL matches
    ``STOP_PATTERN``, or ``max_pages`` pages have been processed.

    Args:
        max_pages: Number of listing pages to visit (pages 0..max_pages-1).

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``text``.
    """
    results: list[dict] = []

    for page in range(max_pages):
        try:
            soup = fetch_listing(page)
        except Exception as e:
            print(f"Failed to load page {page}: {e}")
            break

        # Every news item sits in an <article> element.
        cards = soup.select("article")
        if not cards:
            print(f"Page {page}: no article cards found.")
            break

        stop = False
        for card in cards:
            h = card.find(["h2", "h3"])
            if not h or not h.a:
                continue
            href = h.a.get("href")
            if not href:
                # An anchor without a target cannot be followed.
                continue

            url = href if href.startswith("http") else urljoin(BASE_URL, href)

            if STOP_PATTERN.search(url):  # reached April 2025
                stop = True
                print("⚠️  Stop pattern reached — ending crawl.")
                break

            # get_text(strip=True) would glue words around inline tags.
            title = " ".join(h.get_text().split())
            text = extract_article_text(url)

            results.append({"title": title, "url": url, "text": text})
            # Article pages are requests too; pace them like listing pages.
            time.sleep(REQUEST_DELAY)

        print(f"Page {page}: collected {len(results)} articles")
        if stop:
            break
        time.sleep(REQUEST_DELAY)

    return results

def save_csv(rows: list[dict], path: str = OUT_PATH) -> None:
    """Write the crawled rows to a fresh CSV file at ``path``.

    When ``rows`` is empty nothing is written and a notice is printed.
    """
    if not rows:
        print("Nothing to save.")
        return None

    columns = ["title", "url", "text"]
    with open(path, mode="w", newline="", encoding="utf-8") as csv_file:
        out = csv.DictWriter(csv_file, fieldnames=columns)
        out.writeheader()
        for record in rows:
            out.writerow(record)

    print(f"\nSaved {len(rows)} articles to {path}.")


# Run the crawl over MAX_PAGES listing pages and write the rows to OUT_PATH.
data = crawl(MAX_PAGES)
save_csv(data, OUT_PATH)

In [None]:
# First listing page of the section; later pages are built from PAGE_URL.
BASE_URL = "https://truthout.org/section/politics-elections/"
PAGE_URL = f"{BASE_URL}page/{{page}}/"

# The crawl ends at the first article whose URL path contains this segment.
STOP_PATTERN = re.compile(r"/2025/04/")
# Only articles whose URL path starts with one of these year/month pairs are kept.
MONTHS_WANTED = {"2025/05", "2025/06"}

# Sent with every request so the crawler identifies itself.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://example.com/bot)",
}

OUT_PATH = Path("truthout_politics_2025_may_june.csv")
REQUEST_DELAY = 1.5  # seconds to sleep after each listing page
MAX_PAGES = 40  # number of the last listing page to visit


def fetch_listing(page: int = 1) -> BeautifulSoup:
    """Download one listing page of the section and return it parsed.

    Page 1 is ``BASE_URL`` itself; other pages are built from ``PAGE_URL``.
    An HTTP error status raises through ``raise_for_status``.
    """
    if page == 1:
        listing_url = BASE_URL
    else:
        listing_url = PAGE_URL.format(page=page)

    response = requests.get(listing_url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "html.parser")


def extract_article_text(url):
    """Download an article page and return its body text.

    The text is taken from the ``articleBody`` of a ``NewsArticle`` node in
    the page's JSON-LD metadata.  When no non-empty body is found there, the
    ``<p>`` elements of the first ``<article>`` tag are used instead.

    Args:
        url: Absolute URL of the article.

    Returns:
        The article text, or ``""`` when the request fails or neither source
        yields anything.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string)
            except (json.JSONDecodeError, TypeError):
                # Empty (script.string is None) or malformed block: skip it.
                continue

            if isinstance(data, dict):
                candidates = [data]
                # Sometimes the JSON is wrapped in "@graph".
                graph = data.get("@graph")
                if isinstance(graph, list):
                    candidates.extend(graph)
            elif isinstance(data, list):
                candidates = data
            else:
                continue

            for item in candidates:
                if not isinstance(item, dict):
                    continue
                types = item.get("@type")
                # "@type" may be a single string or a list of type names.
                is_news = types == "NewsArticle" or (
                    isinstance(types, list) and "NewsArticle" in types
                )
                body = item.get("articleBody")
                # Only a non-empty body counts; otherwise keep looking so the
                # HTML fallback below still gets its chance.
                if is_news and isinstance(body, str) and body.strip():
                    return body.strip()

        # Fallback for pages whose JSON-LD carries no usable article body.
        article = soup.find("article")
        if article:
            # get_text(strip=True) would glue words around inline tags, so
            # collapse whitespace on the raw paragraph text instead.
            return "\n".join(
                " ".join(p.get_text().split()) for p in article.find_all("p")
            )

        return ""
    except Exception as exc:
        # Best effort: one unreachable article must not abort the whole run.
        print(f"Failed to fetch “{url}”: {exc}")
        return ""


def crawl(max_pages=MAX_PAGES) -> list[dict]:
    """Walk the listing pages and download the articles of the wanted months.

    The crawl ends when a listing page fails to load, a page has no
    ``<article>`` cards, an article URL path matches ``STOP_PATTERN``, or
    ``max_pages`` has been processed.  Articles whose URL path does not
    start with a year/month pair from ``MONTHS_WANTED`` are skipped.

    Args:
        max_pages: Last listing page to fetch (pages start at 1, inclusive).

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``text``.
    """
    collected: list[dict] = []
    for page in range(1, max_pages + 1):

        try:
            soup = fetch_listing(page)
        except Exception as exc:
            print(f"Listing page {page} failed: {exc}")
            break

        cards = soup.select("article")
        if not cards:
            print(f"Page {page}: никаких <article> не нашли, стоп.")
            break

        for card in cards:
            h = card.find(["h2", "h3"])
            if not h or not h.a:
                continue

            url = h.a.get("href") or ""
            url = url if url.startswith("http") else urljoin(BASE_URL, url)
            path = urlsplit(url).path

            if STOP_PATTERN.search(path):
                print("Stop-pattern reached → crawl finished.")
                return collected

            # Keep only the wanted months: "/2025/06/slug" -> "2025/06".
            month_part = "/".join(path.split("/")[:3])
            if month_part[1:] not in MONTHS_WANTED:
                continue

            # get_text(strip=True) would glue words around inline tags.
            title = " ".join(h.get_text().split())
            text = extract_article_text(url)
            collected.append({"title": title, "url": url, "text": text})
            # Article pages are requests too; pace them like listing pages.
            time.sleep(REQUEST_DELAY)

        print(f"Page {page}: всего статей собрано {len(collected)}")
        time.sleep(REQUEST_DELAY)

    return collected


def save_csv(rows: list[dict], path=OUT_PATH):
    """Write the crawled rows to a fresh CSV file, creating parent folders.

    Args:
        rows: Dicts with the keys ``title``, ``url`` and ``text``.
        path: Destination file, given as ``str`` or ``pathlib.Path``.

    When ``rows`` is empty nothing is written and a notice is printed.
    """
    if not rows:
        print("Nothing to save.")
        return

    # Accept plain strings too, like the other save_csv variants in this file.
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "url", "text"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"\nSaved {len(rows)} articles → «{path}»")


# Run the crawl with the default page limit and write the rows to OUT_PATH.
data = crawl()
save_csv(data)