In [None]:
pip install selenium


Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m105.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import csv
import time
import random
import os

# Setup Selenium ChromeDriver (headless mode)
def setup_driver():
    """Create and return a Chrome WebDriver configured for headless use.

    The browser is started with the ``--headless``, ``--disable-gpu`` and
    ``--no-sandbox`` command-line switches and a default ``Service()``.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = Options()
    # Drop '--headless' from this tuple to watch the browser window while scraping.
    for switch in ('--headless', '--disable-gpu', '--no-sandbox'):
        chrome_options.add_argument(switch)

    return webdriver.Chrome(service=Service(), options=chrome_options)

# Scrape a single subcategory
def scrape_subcategory(driver, category, subcategory, writer, max_pages=1400, max_records=50000):
    """Scrape the paginated article listing of one subcategory into CSV rows.

    Pages are fetched from
    ``https://www.utusan.com.my/category/<category>/<subcategory>/page/<n>/``
    starting at ``n = 1``. For every element with class ``jeg_post`` on a page,
    one row ``[title, link, date, category, subcategory]`` is written through
    ``writer``. Missing parts are replaced by the placeholders ``'No Title'``,
    ``'No URL'`` and ``'No Date'``.

    The loop ends when ``max_records`` rows have been written, when
    ``max_pages`` pages have been visited, or when a page contains no
    ``jeg_post`` elements. An exception raised while loading or parsing a page
    is printed and the next page is tried after a one-second pause.

    Args:
        driver: Selenium WebDriver used to load pages and locate elements.
        category: Top-level category slug used in the URL (e.g. ``'gaya'``).
        subcategory: Subcategory slug used in the URL (e.g. ``'agama'``).
        writer: Object with a ``writerow(list)`` method (e.g. ``csv.writer``).
        max_pages: Highest page number that will be requested.
        max_records: Maximum number of rows to write for this subcategory.

    Returns:
        int: Number of rows written. Earlier versions returned ``None``;
        callers that ignore the return value are unaffected.
    """
    base_url = f'https://www.utusan.com.my/category/{category}/{subcategory}/page/'
    page_num = 1
    total_scraped = 0

    while total_scraped < max_records and page_num <= max_pages:
        url = f"{base_url}{page_num}/"
        print(f"🔄 Loading {category}/{subcategory} - Page {page_num}: {url}")
        try:
            driver.get(url)
            time.sleep(random.uniform(1, 3))  # Randomised pause to let the page load

            articles = driver.find_elements(By.CLASS_NAME, 'jeg_post')
            if not articles:
                print(f"⚠️ No articles found on page {page_num} for {subcategory}. Stopping.")
                break

            for article in articles:
                if total_scraped >= max_records:
                    break

                # `except Exception` (not a bare `except:`) keeps the best-effort
                # placeholders but lets KeyboardInterrupt/SystemExit propagate,
                # so a long scrape can still be interrupted from the notebook.
                try:
                    title_elem = article.find_element(By.CLASS_NAME, 'jeg_post_title')
                    title = title_elem.text.strip()
                    link = title_elem.find_element(By.TAG_NAME, 'a').get_attribute('href')
                except Exception:
                    # Title and link are looked up together; if either lookup
                    # fails, both fall back to their placeholders.
                    title, link = 'No Title', 'No URL'

                try:
                    date = article.find_element(By.CLASS_NAME, 'jeg_meta_date').text.strip()
                except Exception:
                    date = 'No Date'

                writer.writerow([title, link, date, category, subcategory])
                total_scraped += 1

            print(f"✅ Done page {page_num} of {subcategory}. Total scraped: {total_scraped}")
            page_num += 1

        except Exception as e:
            # Skip the failing page instead of aborting the whole subcategory.
            print(f"❌ Error on {category}/{subcategory} page {page_num}: {e}")
            page_num += 1
            time.sleep(1)

    return total_scraped

def run_full_scraper(filename='utusan_full_scrape.csv'):
    """Scrape every configured subcategory and append the rows to a CSV file.

    Opens ``filename`` in append mode (UTF-8), writes the header row
    ``Title, URL, Date, Category, Subcategory`` when the file is missing or
    empty, then scrapes each ``gaya`` subcategory followed by
    ``nasional/komuniti`` using a single shared WebDriver.

    Args:
        filename: Path of the CSV file to append to. Defaults to
            ``'utusan_full_scrape.csv'``, the name used by the original
            zero-argument call.
    """
    # Gaya category and its subcategories
    gaya_subcategories = [
        'agama', 'agro', 'anakku', 'deko', 'fesyen',
        'gajet', 'hiburan', 'keluarga-wanita', 'kesihatan', 'pendidikan'
    ]
    # Gaya subcategories first, then Nasional > Komuniti.
    targets = [('gaya', subcat) for subcat in gaya_subcategories]
    targets.append(('nasional', 'komuniti'))

    # A file that exists but is empty (e.g. left by an aborted earlier run)
    # still needs the header row.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    driver = setup_driver()
    try:
        with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            if needs_header:
                writer.writerow(['Title', 'URL', 'Date', 'Category', 'Subcategory'])

            for category, subcat in targets:
                scrape_subcategory(driver, category, subcat, writer)
                # Push buffered rows out after each subcategory so finished
                # work is not sitting only in the Python-level buffer.
                csvfile.flush()
    finally:
        # Always shut the browser down, even if scraping raised or was
        # interrupted; otherwise the Chrome process is leaked.
        driver.quit()

    print("🎉 Full scraping complete!")

# Start the full scrape. Rows are appended to 'utusan_full_scrape.csv' (relative
# path). This can run for a long time: by default up to 1400 pages are requested
# per subcategory, with a 1-3 second pause after each page load.
run_full_scraper()


🔄 Loading gaya/agama - Page 1: https://www.utusan.com.my/category/gaya/agama/page/1/
✅ Done page 1 of agama. Total scraped: 18
🔄 Loading gaya/agama - Page 2: https://www.utusan.com.my/category/gaya/agama/page/2/
✅ Done page 2 of agama. Total scraped: 36
🔄 Loading gaya/agama - Page 3: https://www.utusan.com.my/category/gaya/agama/page/3/
✅ Done page 3 of agama. Total scraped: 54
🔄 Loading gaya/agama - Page 4: https://www.utusan.com.my/category/gaya/agama/page/4/
✅ Done page 4 of agama. Total scraped: 72
🔄 Loading gaya/agama - Page 5: https://www.utusan.com.my/category/gaya/agama/page/5/
✅ Done page 5 of agama. Total scraped: 90
🔄 Loading gaya/agama - Page 6: https://www.utusan.com.my/category/gaya/agama/page/6/
✅ Done page 6 of agama. Total scraped: 108
🔄 Loading gaya/agama - Page 7: https://www.utusan.com.my/category/gaya/agama/page/7/
✅ Done page 7 of agama. Total scraped: 126
🔄 Loading gaya/agama - Page 8: https://www.utusan.com.my/category/gaya/agama/page/8/
✅ Done page 8 of agama. 

In [None]:
# List the contents of /content — presumably to check whether the scraper's
# CSV file is present before downloading it (TODO confirm /content is the
# kernel's working directory in this runtime).
!ls /content


sample_data


In [None]:
import os

from google.colab import files

# Relative path: resolved against the kernel's current working directory.
csv_path = 'utusan_full_scrape.csv'

# Downloading a missing file raises FileNotFoundError, so check first and
# report where the file was expected instead of leaving a traceback in the
# notebook.
if os.path.exists(csv_path):
    files.download(csv_path)
else:
    print(
        f"Cannot download {csv_path!r}: not found in {os.getcwd()}. "
        "Run the scraper cell in this session first (files do not survive a runtime reset)."
    )



FileNotFoundError: Cannot find file: utusan_full_scrape.csv