# Web Scraping using BeautifulSoup + Playwright + Asyncio

This script is designed to scrape news articles from the Nasional section of the Utusan Malaysia website, specifically the Jenayah, Mahkamah, and Tragedi subcategories.

Key Features of the Script:
Dynamic Page Navigation: The script loops through up to 900 pages per subcategory using Playwright’s headless Chromium browser, stopping early for a subcategory as soon as a page contains no articles.

Content Extraction: For each article, it captures:

- Title of the news article

- Publication date

- Category (tags)

- The originating tab (Jenayah, Mahkamah, or Tragedi)

- Full URL of the article

Data Storage: All data is saved into a CSV file named utusan_nasional_combined.csv.


In [None]:
# Observed runtime: 6h 45m 2s for 3 tabs (~28K records)
# Install libraries (only once)
!pip install -q playwright
!playwright install

import asyncio
import csv
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Settings
# Upper bound on the number of listing pages requested PER tab; scraping of a
# tab stops earlier at the first page that yields no articles.
MAX_PAGES = 900
# Subcategory slugs inserted into the /category/nasional/<tab>/page/<n>/ URL.
# Add or remove tabs as needed.
TABS = ['jenayah', 'mahkamah', 'tragedi']
# Destination CSV; it is opened in 'w' mode, so an existing file is overwritten.
OUTPUT_CSV = 'utusan_nasional_combined.csv'

# Site root used both to build listing URLs and to resolve relative article links.
_BASE_URL = 'https://www.utusan.com.my'


def _article_row(article, tab):
    """Build one CSV row ``[title, date, categories, tab, url]`` from a listing card.

    Args:
        article: BeautifulSoup element matched by ``article.jeg_post``.
        tab: Subcategory slug the card was found under (written capitalized).

    Returns:
        A five-item list. Missing pieces are replaced by the placeholders
        'No Title', 'No Timestamp', 'No Category' and 'No URL'.
    """
    title_tag = article.select_one('h3.jeg_post_title a')
    timestamp_tag = article.select_one('div.jeg_meta_date')
    category_tags = article.select('div.jeg_post_category span a')

    title = title_tag.get_text(strip=True) if title_tag else 'No Title'
    timestamp = timestamp_tag.get_text(strip=True) if timestamp_tag else 'No Timestamp'
    if category_tags:
        categories = ', '.join(tag.get_text(strip=True) for tag in category_tags)
    else:
        categories = 'No Category'

    # .get() instead of ['href']: an anchor without href should fall back to
    # 'No URL' rather than raise and drop the whole row.
    link = title_tag.get('href') if title_tag else None
    # urljoin leaves absolute URLs untouched and correctly resolves '/path',
    # 'path' and protocol-relative '//host/path' against the site root.
    full_url = urljoin(f'{_BASE_URL}/', link) if link else 'No URL'

    return [title, timestamp, categories, tab.capitalize(), full_url]


async def _scrape_tab(page, tab, writer, csvfile):
    """Scrape listing pages 1..MAX_PAGES of one tab and write a row per article.

    Stops at the first page that contains no ``article.jeg_post`` elements.

    Args:
        page: Playwright page reused for every navigation.
        tab: Subcategory slug to scrape.
        writer: ``csv.writer`` receiving the rows.
        csvfile: The underlying file object, flushed after each page.

    Returns:
        List of listing URLs whose processing raised an exception.
    """
    failed_urls = []

    for page_num in range(1, MAX_PAGES + 1):
        url = f'{_BASE_URL}/category/nasional/{tab}/page/{page_num}/'
        print(f"📄 Scraping {tab.capitalize()} - Page {page_num}")

        try:
            # Playwright timeouts are in milliseconds (500 s navigation limit).
            await page.goto(url, timeout=500_000)
            await page.wait_for_timeout(8000)  # Fixed 8 s pause to let the page finish loading
            html = await page.content()
            soup = BeautifulSoup(html, 'html.parser')
            articles = soup.select('article.jeg_post')
            print(f"✅ Found {len(articles)} articles on page {page_num}")

            if not articles:
                print(f"🛑 No more articles. Breaking early at page {page_num}.")
                break

            for article in articles:
                # Best effort per article: one malformed card must not lose the page.
                try:
                    writer.writerow(_article_row(article, tab))
                except Exception as e:
                    print(f"⚠️ Error parsing article: {e}")

            # Persist progress page by page so a crash late in a multi-hour
            # run does not lose rows still sitting in the write buffer.
            csvfile.flush()

        except Exception as e:
            # Best effort per page: report, remember, and move on to the next one.
            print(f"❌ Error loading page {url}: {e}")
            failed_urls.append(url)

    return failed_urls


async def run():
    """Scrape every tab in TABS and save the articles to OUTPUT_CSV.

    Writes a header row followed by one row per article
    (Title, Date, Category, Tab, URL). Page-level and article-level errors
    are printed and skipped; pages that failed are summarised at the end.
    The headless Chromium browser is always closed, even if scraping aborts.
    """
    failed_urls = []

    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # CSV column header
        writer.writerow(['Title', 'Date', 'Category', 'Tab', 'URL'])

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                for tab in TABS:
                    print(f"\n📂 Starting tab: {tab}")
                    failed_urls.extend(await _scrape_tab(page, tab, writer, csvfile))
            finally:
                await browser.close()

    if failed_urls:
        print(f"\n⚠️ {len(failed_urls)} page(s) could not be scraped:")
        for url in failed_urls:
            print(f"   {url}")

    print(f"\n🎉 All tabs scraped successfully. Data saved to {OUTPUT_CSV}")

# Run it. Top-level `await` is only valid inside a notebook/IPython cell, where
# an event loop is already running; from a plain .py script, call
# asyncio.run(run()) instead.
await run()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Chromium 136.0.7103.25 (playwright build v1169)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1169/chromium-linux.zip[22m
[1G167.7 MiB [] 0% 0.0s[0K[1G167.7 MiB [] 0% 103.6s[0K[1G167.7 MiB [] 0% 113.3s[0K[1G167.7 MiB [] 0% 72.1s[0K[1G167.7 MiB [] 0% 49.8s[0K[1G167.7 MiB [] 0% 37.4s[0K[1G167.7 MiB [] 0% 29.9s[0K[1G167.7 MiB [] 0% 21.7s[0K[1G167.7 MiB [] 0% 17.0s[0K[1G167.7 MiB [] 1% 16.1s[0K[1G167.7 MiB [] 1% 11.6s[0K[1G167.7 MiB [] 2% 9.5s[0K[1G167.7 MiB [] 2% 8.8s[0K[1G167.7 MiB [] 2% 7.6s[0K[1G167.7 MiB [] 3% 7.3s[0K[1G167.7 MiB [] 3% 6.6s[0K[1G167.7 MiB [] 4% 6.0s[0K[1G167.7 MiB [] 4% 5.9s[0K[1G167.7 MiB [] 5% 5.6s[0K[1G167.7 MiB [] 5% 5.5s[0K[1G167.7 MiB [] 5% 5.7s[0K[1G167.7 MiB [] 6% 5.3s[0K[1G167.7 MiB [] 7% 5.0s[0K[1G167.7 MiB [] 8% 4.7s[0K[1G167.7 MiB [] 8% 4.