# Web Scraping Using Selenium
## Group Members

| No. | Name             |
|-----|------------------|
| 1   | JOSEPH LAU YEO KAI A22EC0055         |
| 2   | VINESH A/L VIJAYA KUMAR A22EC0290   |
| 3   | VINESH A/L VIJAYA KUMAR A22EC0290    |
| 4   | NUR FARAH ADIBAH BINTI IDRIS A22EC0245       |




In [None]:
# Install necessary Python packages
!pip install selenium webdriver_manager chromedriver_autoinstaller

# Install required Linux packages
!apt-get update -y
!apt-get install -y wget unzip

# Download Chrome and ChromeDriver for headless use in Colab
!wget -q https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.96/linux64/chrome-linux64.zip
!wget -q https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.96/linux64/chromedriver-linux64.zip

# Extract the downloaded zip files
!unzip chrome-linux64.zip
!unzip chromedriver-linux64.zip


Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting chromedriver_autoinstaller
  Downloading chromedriver_autoinstaller-0.6.4-py3-none-any.whl.metadata (2.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
# Print kernel and architecture details of the runtime
# (presumably to confirm the linux64 Chrome build suits this machine — TODO confirm).
!uname -a

Linux 0aadc46e54af 6.1.123+ #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux


In [None]:
# Set the system PATH to include the Chrome and ChromeDriver binaries
import os

# Append each directory only when it is missing, so re-running this cell does
# not keep growing PATH with duplicate entries. A missing PATH variable is
# treated as empty instead of raising KeyError.
_path_entries = os.environ.get('PATH', '').split(os.pathsep)
for _binary_dir in ('chrome-linux64', 'chromedriver-linux64'):
    _binary_path = os.path.join(os.getcwd(), _binary_dir)
    if _binary_path not in _path_entries:
        _path_entries.append(_binary_path)
os.environ['PATH'] = os.pathsep.join(_path_entries)

# Page limit per category
|   | Category                   | URL                                                                                                                          |
| - | -------------------------- | ---------------------------------------------------------------------------------------------------------------------------- |
| 0 | NST Viral                  | [https://www.nst.com.my/news/nst-viral?page=81](https://www.nst.com.my/news/nst-viral?page=81)                               |
| 1 | Crime & Courts             | [https://www.nst.com.my/news/crime-courts?page=1518](https://www.nst.com.my/news/crime-courts?page=1518)                     |
| 2 | Nation                     | [https://www.nst.com.my/news/nation?page=8994](https://www.nst.com.my/news/nation?page=8994)                                 |
| 3 | Government & Public Policy | [https://www.nst.com.my/news/government-public-policy?page=56](https://www.nst.com.my/news/government-public-policy?page=56) |
| 4 | Politics                   | [https://www.nst.com.my/news/politics?page=1032](https://www.nst.com.my/news/politics?page=1032)                             |



# News Category


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime

from urllib.parse import urljoin

import time
import csv
import os
import sys


In [None]:

# Configuration
BASE_URL = 'https://www.nst.com.my'
def setup_driver():
    """Create a headless Chrome WebDriver that uses the local ChromeDriver.

    The driver executable is expected at ``chromedriver-linux64/chromedriver``
    under the current working directory.

    Returns:
        A ``webdriver.Chrome`` instance configured with the flags listed below.
    """
    options = Options()
    # New headless mode, sandbox disabled, /dev/shm usage disabled.
    for flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    chromedriver = os.path.join(os.getcwd(), 'chromedriver-linux64', 'chromedriver')
    return webdriver.Chrome(
        service=Service(executable_path=chromedriver),
        options=options,
    )


def _child_text(parent, class_name, default):
    """Return the stripped text of the first descendant of *parent* with *class_name*, or *default* if the lookup fails."""
    try:
        return parent.find_element(By.CLASS_NAME, class_name).text.strip()
    except Exception:
        # Best effort: a missing optional field must not drop the whole article.
        # ``Exception`` (not a bare ``except``) lets Ctrl-C still stop the run.
        return default


def scrape_nst_articles(url, driver):
    """Scrape the article teasers listed on one NST page.

    Args:
        url: Address of the listing page to load.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        A list of dicts with the keys 'Section', 'Date', 'Headline' and
        'Summary', one per teaser that has a non-empty headline. The list is
        empty when the page cannot be loaded or no teaser appears within the
        5-second wait; the error is printed rather than raised.
    """
    articles_data = []

    try:
        print(f"Accessing URL: {url}")
        driver.get(url)

        # Wait for articles to load
        print("Waiting for content to load...")
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "article-teaser"))
        )

        articles = driver.find_elements(By.CLASS_NAME, "article-teaser")
        print(f"Found {len(articles)} articles")

        for article in articles:
            try:
                # A matched element without a headline raises here and is skipped.
                headline = article.find_element(By.CLASS_NAME, "field-title").text.strip()

                summary = _child_text(article, "article-teaser", "")

                # Look up the metadata inside *this* article. Searching from
                # the driver would return the first "article-meta" on the
                # page, giving every row the same section and date.
                try:
                    meta_div = article.find_element(By.CLASS_NAME, "article-meta")
                except Exception:
                    section = "Unknown"
                    date = "Unknown"
                else:
                    section = _child_text(meta_div, "field-category", "Unknown")
                    date = _child_text(meta_div, "created-ago", "Unknown")

                if headline and date:
                    articles_data.append({
                        'Section': section,
                        'Date': date,
                        'Headline': headline,
                        'Summary': summary,
                    })

            except Exception:
                # Deliberately silent: skip elements that are not full articles.
                continue

    except Exception as e:
        print(f"Error accessing website: {e}")

    return articles_data



def save_to_csv(articles_data, filename):
    """Write scraped article rows to *filename* as a UTF-8 CSV file.

    Args:
        articles_data: List of dicts keyed by 'Section', 'Date', 'Headline'
            and 'Summary'.
        filename: Destination path; the file is opened in write mode.

    When ``articles_data`` is empty nothing is written and a notice is printed.
    """
    columns = ['Section', 'Date', 'Headline', 'Summary']

    if articles_data:
        with open(filename, 'w', newline='', encoding='utf-8') as out:
            csv_out = csv.DictWriter(out, fieldnames=columns)
            csv_out.writeheader()
            for row in articles_data:
                csv_out.writerow(row)
            print(f"Saved {len(articles_data)} articles to {filename}")
    else:
        print("No articles to save.")

def main(start_page=1, end_page=1320, filename='nst_articles_nation_page_1_1300.csv'):
    """Scrape a range of NST 'nation' listing pages and save them to CSV.

    Args:
        start_page: First page number to request.
        end_page: Page number to stop before (exclusive, as in ``range``).
        filename: CSV file that receives every article collected.

    Pages that yield no articles are skipped and the run carries on. Whatever
    has been collected is written out even if the run fails or is interrupted
    part-way, and the browser is always closed afterwards.
    """
    # NOTE(review): the default filename is kept from the original cell; its
    # "1_1300" label does not match the default page range (1-1319).
    news_cat = ['nation']

    all_articles = []
    driver = None  # keeps the cleanup below safe if setup_driver() raises

    try:
        # Setup WebDriver once for all pages
        driver = setup_driver()
        for cat in news_cat:
            full_path = urljoin(BASE_URL, f'/news/{cat}')

            for page in range(start_page, end_page):
                print(f"\nScraping page {page} for {cat.capitalize()}...")
                page_url = f"{full_path}?page={page}"
                articles_data = scrape_nst_articles(page_url, driver)
                if not articles_data:
                    # The loop moves on to the next page, so say so.
                    print(f"No articles found on page {page} or error accessing page. Skipping this page for {cat}.")
                    continue
                all_articles.extend(articles_data)
                print(f"Successfully scraped {len(articles_data)} articles from page {page}")
                time.sleep(0.5)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        try:
            # Save in ``finally`` so a crash or Ctrl-C during a long run does
            # not throw away the pages already scraped.
            save_to_csv(all_articles, filename)

            if all_articles:
                print(f"\nTotal articles scraped: {len(all_articles)}")
                print(f"Articles saved to: {filename}")
            else:
                print(f"\nNo articles were found.")
        finally:
            if driver is not None:
                print("Closing browser...")
                driver.quit()

if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 487

Scraping page 488 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=488
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 488

Scraping page 489 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=489
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 489

Scraping page 490 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=490
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 490

Scraping page 491 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=491
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 491

Scraping page 492 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page

In [None]:

# Configuration
BASE_URL = 'https://www.nst.com.my'

def setup_driver():
    """Create a headless Chrome WebDriver that uses the local ChromeDriver.

    The driver executable is expected at ``chromedriver-linux64/chromedriver``
    under the current working directory.

    Returns:
        A ``webdriver.Chrome`` instance configured with the flags listed below.
    """
    options = Options()
    # New headless mode, sandbox disabled, /dev/shm usage disabled.
    for flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    chromedriver = os.path.join(os.getcwd(), 'chromedriver-linux64', 'chromedriver')
    return webdriver.Chrome(
        service=Service(executable_path=chromedriver),
        options=options,
    )


def _child_text(parent, class_name, default):
    """Return the stripped text of the first descendant of *parent* with *class_name*, or *default* if the lookup fails."""
    try:
        return parent.find_element(By.CLASS_NAME, class_name).text.strip()
    except Exception:
        # Best effort: a missing optional field must not drop the whole article.
        # ``Exception`` (not a bare ``except``) lets Ctrl-C still stop the run.
        return default


def scrape_nst_articles(url, driver):
    """Scrape the article teasers listed on one NST page.

    Args:
        url: Address of the listing page to load.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        A list of dicts with the keys 'Section', 'Date', 'Headline' and
        'Summary', one per teaser that has a non-empty headline. The list is
        empty when the page cannot be loaded or no teaser appears within the
        5-second wait; the error is printed rather than raised.
    """
    articles_data = []

    try:
        print(f"Accessing URL: {url}")
        driver.get(url)

        # Wait for articles to load
        print("Waiting for content to load...")
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "article-teaser"))
        )

        articles = driver.find_elements(By.CLASS_NAME, "article-teaser")
        print(f"Found {len(articles)} articles")

        for article in articles:
            try:
                # A matched element without a headline raises here and is skipped.
                headline = article.find_element(By.CLASS_NAME, "field-title").text.strip()

                summary = _child_text(article, "article-teaser", "")

                # Look up the metadata inside *this* article. Searching from
                # the driver would return the first "article-meta" on the
                # page, giving every row the same section and date.
                try:
                    meta_div = article.find_element(By.CLASS_NAME, "article-meta")
                except Exception:
                    section = "Unknown"
                    date = "Unknown"
                else:
                    section = _child_text(meta_div, "field-category", "Unknown")
                    date = _child_text(meta_div, "created-ago", "Unknown")

                if headline and date:
                    articles_data.append({
                        'Section': section,
                        'Date': date,
                        'Headline': headline,
                        'Summary': summary,
                    })

            except Exception:
                # Deliberately silent: skip elements that are not full articles.
                continue

    except Exception as e:
        print(f"Error accessing website: {e}")

    return articles_data



def save_to_csv(articles_data, filename):
    """Write scraped article rows to *filename* as a UTF-8 CSV file.

    Args:
        articles_data: List of dicts keyed by 'Section', 'Date', 'Headline'
            and 'Summary'.
        filename: Destination path; the file is opened in write mode.

    When ``articles_data`` is empty nothing is written and a notice is printed.
    """
    columns = ['Section', 'Date', 'Headline', 'Summary']

    if articles_data:
        with open(filename, 'w', newline='', encoding='utf-8') as out:
            csv_out = csv.DictWriter(out, fieldnames=columns)
            csv_out.writeheader()
            for row in articles_data:
                csv_out.writerow(row)
            print(f"Saved {len(articles_data)} articles to {filename}")
    else:
        print("No articles to save.")

def main(start_page=2751, end_page=4000, filename='nst_articles_nation_page_2751_4000.csv'):
    """Scrape a range of NST 'nation' listing pages and save them to CSV.

    Args:
        start_page: First page number to request.
        end_page: Page number to stop before (exclusive, as in ``range``).
        filename: CSV file that receives every article collected.

    Pages that yield no articles are skipped and the run carries on. Whatever
    has been collected is written out even if the run fails or is interrupted
    part-way, and the browser is always closed afterwards.
    """
    news_cat = ['nation']

    all_articles = []
    driver = None  # keeps the cleanup below safe if setup_driver() raises

    try:
        # Setup WebDriver once for all pages
        driver = setup_driver()
        for cat in news_cat:
            full_path = urljoin(BASE_URL, f'/news/{cat}')

            for page in range(start_page, end_page):
                print(f"\nScraping page {page} for {cat.capitalize()}...")
                page_url = f"{full_path}?page={page}"
                articles_data = scrape_nst_articles(page_url, driver)
                if not articles_data:
                    # The loop moves on to the next page, so say so.
                    print(f"No articles found on page {page} or error accessing page. Skipping this page for {cat}.")
                    continue
                all_articles.extend(articles_data)
                print(f"Successfully scraped {len(articles_data)} articles from page {page}")
                time.sleep(0.5)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        try:
            # Save in ``finally`` so a crash or Ctrl-C during a long run does
            # not throw away the pages already scraped.
            save_to_csv(all_articles, filename)

            if all_articles:
                print(f"\nTotal articles scraped: {len(all_articles)}")
                print(f"Articles saved to: {filename}")
            else:
                print(f"\nNo articles were found.")
        finally:
            if driver is not None:
                print("Closing browser...")
                driver.quit()

if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 3167

Scraping page 3168 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=3168
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 3168

Scraping page 3169 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=3169
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 3169

Scraping page 3170 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=3170
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 3170

Scraping page 3171 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=3171
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 3171

Scraping page 3172 for Nation...
Accessing URL: https://www.nst.com.my/ne