In [3]:
import json
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Root URL of the site section being scraped; the listing (?paged=N) and
# single-post (?p=ID) URLs are built from it.
BASE_URL = "https://president.mn/mng/"
# Number of listing pages to iterate over.
NUM_PAGES = 272
# Collected results, filled in by the scraping cell.
OUTPUT = dict()

# HTTP headers sent with every request.
headers = {"User-Agent": "Mozilla/5.0"}

In [4]:
# Walk every listing page, pick up the post IDs of the <article> elements on
# it, then fetch each post and store the text of its paragraphs in OUTPUT
# under the key "p-<id>". Posts already present in OUTPUT are skipped, so the
# cell can be re-run to resume an interrupted scrape.
scraped_ids = set(OUTPUT.keys())  # Already scraped post IDs

# (connect, read) timeouts in seconds. requests applies no timeout unless one
# is given, so without this a single stalled connection hangs the whole run.
REQUEST_TIMEOUT = (10, 30)

# One session for the whole run: it reuses the underlying connection between
# requests and carries the shared headers.
with requests.Session() as session:
    session.headers.update(headers)

    for page in range(1, NUM_PAGES + 1):
        print(f"Processing page {page}...")
        page_url = f"{BASE_URL}?paged={page}"

        try:
            resp = session.get(page_url, timeout=REQUEST_TIMEOUT)
            # Raise on 4xx/5xx instead of parsing an error page as a listing.
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            articles = soup.find_all("article")

            for article in articles:
                post_id = article.get("id")
                # Only elements whose id looks like "post-<number>" are posts.
                if not (post_id and post_id.startswith("post-")):
                    continue

                numeric_id = post_id.split("-")[1]
                key = f"p-{numeric_id}"

                if key in scraped_ids:
                    print(f"Skipping already scraped {key}")
                    continue

                post_url = f"{BASE_URL}?p={numeric_id}"

                # Best effort per post: a failure is reported and the loop
                # moves on to the next article.
                try:
                    post_resp = session.get(post_url, timeout=REQUEST_TIMEOUT)
                    post_resp.raise_for_status()
                    post_soup = BeautifulSoup(post_resp.text, "html.parser")
                    entry_div = post_soup.find("div", class_="entry-content")

                    if entry_div:
                        paragraphs = [p.get_text(strip=True) for p in entry_div.find_all("p")]
                        OUTPUT[key] = {
                            "text": paragraphs
                        }
                        scraped_ids.add(key)
                    else:
                        # Make the skip visible instead of dropping the post silently.
                        print(f"No entry-content found for post {numeric_id}")
                except Exception as e:
                    print(f"Failed to process post {numeric_id}: {e}")

                time.sleep(0.5)  # Be kind to the server

        except Exception as e:
            print(f"Failed to load page {page}: {e}")
        time.sleep(1)


Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Processing page 16...
Processing page 17...
Processing page 18...
Processing page 19...
Processing page 20...
Processing page 21...
Processing page 22...
Processing page 23...
Processing page 24...
Processing page 25...
Processing page 26...
Processing page 27...
Processing page 28...
Processing page 29...
Processing page 30...
Processing page 31...
Processing page 32...
Processing page 33...
Processing page 34...
Processing page 35...
Processing page 36...
Processing page 37...
Processing page 38...
Processing page 39...
Processing page 40...
Processing page 41...
Processing page 42...
Processing page 43...
Processing page 44...
Processing page 45...
Processing page 46...

In [5]:
# Save the results as pretty-printed UTF-8 JSON (non-ASCII text kept readable).
output_path = Path("scraped_data") / "president_news.json"
# open() does not create missing directories; create the target folder first
# so the save does not fail with FileNotFoundError when it is absent.
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as f:
    json.dump(OUTPUT, f, ensure_ascii=False, indent=2)

print("✅ Done scraping and saved to president_news.json")

✅ Done scraping and saved to president_news.json
