# Downloading libraries

In [None]:
# Environment setup via shell escapes. Only stdout is redirected to /dev/null,
# so warnings and errors written to stderr still appear in the cell output.

# Refresh the apt package index.
!apt update > /dev/null
# Install the Chromium WebDriver binary from apt.
!apt install chromium-chromedriver > /dev/null
# Copy chromedriver into /usr/bin. The recorded output of this cell shows cp
# reporting that source and destination are the same file, so on that runtime
# this step was a no-op.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# Install the Selenium Python package (unpinned).
# NOTE(review): the scraping cell visible below uses requests + BeautifulSoup
# only; confirm Selenium/chromedriver are needed elsewhere in the notebook.
!pip install selenium > /dev/null




W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


cp: '/usr/lib/chromium-browser/chromedriver' and '/usr/bin/chromedriver' are the same file


# Scrape the Ekonomi, Ringgit and Luar Negara sections

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import os
from google.colab import drive

# Mount Google Drive at /content/drive so the CSV is written to Drive rather
# than to the Colab runtime's local disk.
drive.mount('/content/drive')

# Destination CSV inside the mounted Drive. run() opens this path with mode 'w',
# so the parent folder must already exist and any existing file is overwritten.
output_path = '/content/drive/MyDrive/ColabNotebooks/HPDP_Project/utusan_articles.csv'

# --- Scraper configuration ---
BASE_URL = "https://www.utusan.com.my"  # site root; category paths are appended to it
MAX_PAGE_TIMEOUT = 300                  # passed as `timeout` to requests.get (seconds)
RETRY_WAIT = 10                         # passed to time.sleep between attempts (seconds)
MAX_RECORDS = 200_000                   # row cap handed to scrape_category for each category

# Category label -> path relative to BASE_URL. Insertion order is the order in
# which run() scrapes the categories.
CATEGORIES = dict([
    # Ringgit / Ekonomi sub-categories
    ("hartanah", "/category/ringgit/hartanah/"),
    ("kewangan", "/category/ekonomi/kewangan/"),
    ("korporat", "/category/ekonomi/korporat/"),
    ("produk", "/category/ekonomi/produk/"),
    ("usahawan", "/category/ekonomi/usahawan/"),
    # Ringgit landing page
    ("ringgit", "/ringgit/"),
    # Luar Negara sub-categories
    ("afrika", "/category/luar-negara/afrika/"),
    ("amerika", "/category/luar-negara/amerika/"),
    ("asia", "/category/luar-negara/asia/"),
    ("asia barat", "/category/luar-negara/asia-barat/"),
    ("asia tenggara", "/category/luar-negara/asia-tenggara/"),
    ("eropah", "/category/luar-negara/eropah/"),
])

# Labels fetched as a single page, with no "page/<n>/" suffix added to the URL.
NO_PAGINATION = set(["ringgit"])

def scrape_page(url, max_attempts=3):
    """Fetch ``url`` and return the response body, retrying on failure.

    Args:
        url: Absolute URL to download.
        max_attempts: Maximum number of GET requests to issue (default 3).

    Returns:
        The response text when a request comes back with HTTP 200, otherwise
        ``None`` once the attempts are exhausted or the server answers with a
        status that retrying cannot fix.
    """
    # "Not Found" / "Gone" mean the page does not exist, so retrying is pointless.
    permanent_statuses = {404, 410}

    for attempt in range(max_attempts):
        try:
            print(f"Loading: {url} (Attempt {attempt + 1})")
            response = requests.get(url, timeout=MAX_PAGE_TIMEOUT)
            if response.status_code == 200:
                return response.text
            print(f"⚠️ Error: {response.status_code}")
            if response.status_code in permanent_statuses:
                break
        except Exception as e:
            # Deliberately broad: a long scrape should log the problem and
            # carry on instead of aborting on one bad request.
            print(f"⚠️ Exception: {e}")
        # Only wait when another attempt will follow; sleeping after the final
        # failure just delays the caller.
        if attempt < max_attempts - 1:
            time.sleep(RETRY_WAIT)
    print(f"❌ Failed to load {url}")
    return None

def _parse_article(article):
    """Extract ``(title, url, date)`` from one ``<article class="jeg_post">`` node."""
    title_tag = article.find('h3', class_='jeg_post_title')
    title = title_tag.get_text(strip=True) if title_tag else 'No Title'
    link_tag = title_tag.find('a') if title_tag else None
    full_url = link_tag['href'] if link_tag and 'href' in link_tag.attrs else 'No URL'

    date_tag = article.find('div', class_='jeg_meta_date')
    date = date_tag.get_text(strip=True) if date_tag else 'No Date'
    return title, full_url, date


def scrape_category(relative_path, category_name, writer, max_records):
    """Scrape one category listing and write one CSV row per article.

    Pages are requested as ``{BASE_URL}{relative_path}page/<n>/`` starting at
    n = 1, unless ``category_name`` is in ``NO_PAGINATION``, in which case only
    ``{BASE_URL}{relative_path}`` is fetched.

    Scraping stops when any of the following happens:
      * ``max_records`` rows have been written,
      * a page cannot be loaded,
      * a page contains no ``article.jeg_post`` elements,
      * a page contributes no rows that were not already written, which
        protects against a site that keeps serving the same articles for page
        numbers past the end.

    Args:
        relative_path: Category path relative to ``BASE_URL``.
        category_name: Label for the category; its capitalised form is written
            to the last CSV column.
        writer: Object with a ``writerow(list)`` method (e.g. ``csv.writer``).
        max_records: Maximum number of rows to write for this category.

    Returns:
        The number of rows written.
    """
    label = category_name.capitalize()  # loop-invariant, so computed once
    paginated = category_name not in NO_PAGINATION
    seen_rows = set()  # (title, url, date) tuples already written for this category
    total_scraped = 0
    page_num = 1

    while total_scraped < max_records:
        if paginated:
            url = f"{BASE_URL}{relative_path}page/{page_num}/"
        else:
            url = f"{BASE_URL}{relative_path}"
        html = scrape_page(url)
        if not html:
            break

        soup = BeautifulSoup(html, 'html.parser')
        articles = soup.find_all('article', class_='jeg_post')
        if not articles:
            break

        new_on_page = 0
        for article in articles:
            if total_scraped >= max_records:
                break
            row = _parse_article(article)
            # Skip articles already emitted, e.g. items repeated on every page
            # or shifted onto the next page while the scrape was running.
            if row in seen_rows:
                continue
            seen_rows.add(row)
            writer.writerow([*row, label])
            total_scraped += 1
            new_on_page += 1

        print(f"✅ {label} - Page {page_num} done. Total scraped: {total_scraped}")

        # A page that added nothing new means we are past the real content.
        if not paginated or new_on_page == 0:
            break
        page_num += 1

    print(f"🎯 Finished scraping category: {category_name} - Total: {total_scraped}")
    return total_scraped

def run():
    """Scrape every entry of CATEGORIES into a single CSV at ``output_path``.

    The file is opened in write mode (any existing file is replaced), a header
    row is written first, and each category is then scraped in dictionary
    order with a limit of MAX_RECORDS rows per category.
    """
    header = ['Title', 'URL', 'Date', 'Category']
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        for name in CATEGORIES:
            scrape_category(CATEGORIES[name], name, writer, MAX_RECORDS)
    print("🎉 All categories scraped and saved to Google Drive!")

# Start scraping.
# NOTE: run() opens output_path in 'w' mode, so re-running this cell replaces
# the previously saved CSV rather than appending to it.
run()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Loading: https://www.utusan.com.my/category/ekonomi/kewangan/page/81/ (Attempt 1)
✅ Kewangan - Page 81 done. Total scraped: 1458
Loading: https://www.utusan.com.my/category/ekonomi/kewangan/page/82/ (Attempt 1)
✅ Kewangan - Page 82 done. Total scraped: 1476
Loading: https://www.utusan.com.my/category/ekonomi/kewangan/page/83/ (Attempt 1)
✅ Kewangan - Page 83 done. Total scraped: 1494
Loading: https://www.utusan.com.my/category/ekonomi/kewangan/page/84/ (Attempt 1)
✅ Kewangan - Page 84 done. Total scraped: 1512
Loading: https://www.utusan.com.my/category/ekonomi/kewangan/page/85/ (Attempt 1)
✅ Kewangan - Page 85 done. Total scraped: 1530
Loading: https://www.utusan.com.my/category/ekonomi/kewangan/page/86/ (Attempt 1)
✅ Kewangan - Page 86 done. Total scraped: 1548
Loading: https://www.utusan.com.my/category/ekonomi/kewangan/page/87/ (Attempt 1)
✅ Kewangan - Page 87 done. Total scraped: 1566
Loading: https://www.utusan.com.