In [5]:
import requests
from bs4 import BeautifulSoup
import json
import time
import os

# --- Crawl configuration ---------------------------------------------------
# Listing pages are addressed as f"{BASE_URL}{page}/" for page = 1..NUM_PAGES.
BASE_URL = "https://news.mn/mng/category/niigem/page/"
NUM_PAGES = 168
DATA_FILE = "scraped_data/news_mn.json"

# Sent with every HTTP request made by the crawl.
headers = {"User-Agent": "Mozilla/5.0"}

# --- Resume support --------------------------------------------------------
# OUTPUT is the list of article records; start empty when no previous dump
# exists, otherwise continue from what an earlier run saved to DATA_FILE.
if not os.path.exists(DATA_FILE):
    OUTPUT = []
else:
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        OUTPUT = json.load(f)

# Links of the records already in OUTPUT, used to skip articles on re-runs.
scraped_links = {record["link"] for record in OUTPUT}

In [2]:
# Overrides NUM_PAGES so that a crawl loop over range(1, NUM_PAGES + 1)
# visits only the first listing page.
# NOTE(review): this cell's execution count ([2]) is lower than the config
# cell's ([5]) and the recorded crawl output goes past page 1, so this override
# was presumably not in effect for the saved run -- on a fresh top-to-bottom
# run it WOULD limit the crawl to one page. Confirm whether to keep this cell.
NUM_PAGES=1

In [6]:
# Crawl listing pages 1..NUM_PAGES. For every article link not already in
# scraped_links, fetch the article, extract its title and paragraph texts,
# append the record to OUTPUT and remember the link. Failures are reported
# and skipped (best effort) so one bad page or article does not stop the crawl.

# Seconds to wait for a response; without a timeout a single stalled
# connection would hang the whole crawl indefinitely.
REQUEST_TIMEOUT = 15

for page in range(1, NUM_PAGES + 1):
    print(f"Processing list page {page}...")
    page_url = f"{BASE_URL}{page}/"

    try:
        resp = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Surface HTTP errors (404/5xx) instead of silently parsing an error
        # page that simply contains no article headings.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        articles = soup.find_all("h2", class_="entry-title")

        for h2 in articles:
            a_tag = h2.find("a")
            if not a_tag:
                continue

            # .get() avoids a KeyError on an anchor without href, which would
            # otherwise abort processing of the rest of this listing page.
            link = a_tag.get("href")
            if not link:
                continue

            if link in scraped_links:
                print(f"Skipping already scraped {link}")
                continue

            try:
                post_resp = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
                # A failed fetch must not be stored as an empty record: the link
                # would land in scraped_links and never be retried on later runs.
                post_resp.raise_for_status()
                post_soup = BeautifulSoup(post_resp.text, "html.parser")

                title_tag = post_soup.find("h2", class_="entry-title")
                title = title_tag.get_text(strip=True) if title_tag else "No Title Found"

                content_div = post_soup.find("div", class_="single-content entry-content uk-clearfix")
                if content_div:
                    paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p")]
                else:
                    paragraphs = []

                post_data = {
                    "link": link,
                    "title": title,
                    "data": paragraphs
                }

                OUTPUT.append(post_data)
                scraped_links.add(link)

            except Exception as e:
                # Best effort: report and move on; the link stays out of
                # scraped_links so a later run retries it.
                print(f"❌ Failed to process article {link}: {e}")

            # Be polite to the server between article requests.
            time.sleep(0.5)

    except Exception as e:
        print(f"❌ Failed to load page {page}: {e}")

    # Pause between listing pages.
    time.sleep(1)




Processing list page 1...
Processing list page 2...
Processing list page 3...
Processing list page 4...
Processing list page 5...
Processing list page 6...
Processing list page 7...
Processing list page 8...
Processing list page 9...
Processing list page 10...
Processing list page 11...
Processing list page 12...
Processing list page 13...
Processing list page 14...
Processing list page 15...
Processing list page 16...
Processing list page 17...
Processing list page 18...
Processing list page 19...
Processing list page 20...
Processing list page 21...
Processing list page 22...
Processing list page 23...
Processing list page 24...
Processing list page 25...
Processing list page 26...
Processing list page 27...
Processing list page 28...
Processing list page 29...
Processing list page 30...
Processing list page 31...
Processing list page 32...
Processing list page 33...
Processing list page 34...
Processing list page 35...
Processing list page 36...
Processing list page 37...
Processing

In [7]:
# Save all data after scraping.
# Create the output directory first: open(..., "w") does not create missing
# parent directories, and failing here would come only after the whole crawl.
out_dir = os.path.dirname(DATA_FILE)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write to a temporary file and swap it into place so an error in the middle
# of json.dump cannot truncate or corrupt a previously saved dataset.
tmp_file = f"{DATA_FILE}.tmp"
with open(tmp_file, "w", encoding="utf-8") as f:
    json.dump(OUTPUT, f, ensure_ascii=False, indent=2)
os.replace(tmp_file, DATA_FILE)

print("✅ Done scraping news.mn!")

✅ Done scraping news.mn!
