# Web scraping
### Tiew Chuan Shen

In [None]:
# Install necessary Python packages
!pip install selenium webdriver_manager chromedriver_autoinstaller

# Install required Linux packages
!apt-get update -y
!apt-get install -y wget unzip

# Download Chrome and ChromeDriver for headless use in Colab
!wget -q https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.96/linux64/chrome-linux64.zip
!wget -q https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.96/linux64/chromedriver-linux64.zip

# Extract the downloaded zip files
!unzip chrome-linux64.zip
!unzip chromedriver-linux64.zip


Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting chromedriver_autoinstaller
  Downloading chromedriver_autoinstaller-0.6.4-py3-none-any.whl.metadata (2.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
# NOTE(review): the save calls visible in this notebook use relative filenames,
# so nothing shown here writes under /content/drive — confirm whether the CSV
# exports are meant to be stored on the mounted drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!uname -a

Linux f4fe5c5ebbfb 6.1.123+ #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux


In [None]:
# Set the system PATH to include Chrome and ChromeDriver binaries
import os

# Each directory is appended only when it is missing, so re-running this cell
# does not keep growing PATH with duplicate entries. A missing PATH variable is
# treated as empty instead of raising KeyError.
_path_entries = os.environ.get('PATH', '').split(os.pathsep)
for _bin_dir in ('chrome-linux64', 'chromedriver-linux64'):
    _bin_path = os.path.join(os.getcwd(), _bin_dir)
    if _bin_path not in _path_entries:
        _path_entries.append(_bin_path)
os.environ['PATH'] = os.pathsep.join(_path_entries)

# Page limit per category

https://www.nst.com.my/news/nst-viral?page=81

https://www.nst.com.my/news/crime-courts?page=1518

https://www.nst.com.my/news/nation?page=8994

https://www.nst.com.my/news/government-public-policy?page=56

https://www.nst.com.my/news/politics?page=1032


# News Category

1st trial, 20 rows: 20 seconds

2nd trial, 26,000 rows (1,1300): started 2.30pm, ended 6pm

1st trial (1,1300): started 10.30pm, ended 3.15am

2nd trial (2501,3000): 11.38pm - 12.50am, failed to save

3rd trial (2501,2750): 12.50am - 1.42am

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime

from urllib.parse import urljoin

import time
import csv
import os
import sys


In [None]:

# Configuration
BASE_URL = 'https://www.nst.com.my'
def setup_driver():
    """Create a headless Chrome WebDriver backed by the local ChromeDriver binary.

    The driver executable is looked up at ``chromedriver-linux64/chromedriver``
    under the current working directory.

    Returns:
        The ``webdriver.Chrome`` instance built from that service and the
        headless options below.
    """
    options = Options()
    for flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    chromedriver = os.path.join(os.getcwd(), 'chromedriver-linux64', 'chromedriver')
    return webdriver.Chrome(
        service=Service(executable_path=chromedriver),
        options=options,
    )


def _first_text(scope, class_name, default):
    """Return the stripped text of the first ``class_name`` element under ``scope``.

    Falls back to ``default`` when the lookup fails, so one missing field does
    not discard the whole article.
    """
    try:
        return scope.find_element(By.CLASS_NAME, class_name).text.strip()
    except Exception:
        return default


def scrape_nst_articles(url, driver):
    """Scrape the article teasers from one NST listing page.

    Args:
        url: Listing page URL to load.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        A list of dicts with the keys 'Section', 'Date', 'Headline' and
        'Summary'. The list is empty when the page fails to load or when no
        teaser yields a headline.
    """
    articles_data = []

    try:
        print(f"Accessing URL: {url}")
        driver.get(url)

        # Wait for articles to load
        print("Waiting for content to load...")
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "article-teaser"))
        )

        articles = driver.find_elements(By.CLASS_NAME, "article-teaser")
        print(f"Found {len(articles)} articles")

        for article in articles:
            try:
                headline = article.find_element(By.CLASS_NAME, "field-title").text.strip()

                # The summary is the teaser element nested inside this teaser.
                summary = _first_text(article, "article-teaser", "")

                # Section and date must be looked up inside THIS article.
                # Searching from the driver returns the first match on the
                # whole page, which would stamp every row with the first
                # article's metadata. If the article has no meta container,
                # fall back to searching the article element itself.
                try:
                    meta_scope = article.find_element(By.CLASS_NAME, "article-meta")
                except Exception:
                    meta_scope = article
                section = _first_text(meta_scope, "field-category", "Unknown")
                date = _first_text(meta_scope, "created-ago", "Unknown")

                if headline and date:
                    articles_data.append({
                        'Section': section,
                        'Date': date,
                        'Headline': headline,
                        'Summary': summary,
                    })

            except Exception:
                # Matched elements without a headline are not article entries;
                # skip them and keep going.
                continue

    except Exception as e:
        print(f"Error accessing website: {e}")

    return articles_data



def save_to_csv(articles_data, filename):
    """Write the scraped article records to ``filename`` as UTF-8 CSV.

    Args:
        articles_data: Sequence of dicts keyed by 'Section', 'Date',
            'Headline' and 'Summary'.
        filename: Destination path; an existing file is overwritten.

    When ``articles_data`` is empty a notice is printed and no file is written.
    """
    if not articles_data:
        print("No articles to save.")
        return

    columns = ['Section', 'Date', 'Headline', 'Summary']

    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for record in articles_data:
            out.writerow(record)
        print(f"Saved {len(articles_data)} articles to {filename}")

def main(start_page=2501, stop_page=2750, filename='nst_articles_nation_page_1_1300.csv'):
    """Scrape a range of NST 'nation' listing pages and save them to one CSV file.

    Args:
        start_page: First page number to scrape.
        stop_page: Page number to stop before (exclusive, as in ``range``).
        filename: Path of the CSV file to write. NOTE(review): the default
            name mentions pages 1-1300 while the default range is 2501-2749;
            pass a matching name to avoid overwriting an earlier export.

    Errors are reported on stdout instead of being raised, and the browser is
    closed on every exit path.
    """
    news_cat = ['nation']

    all_articles = []

    # Bound before the try block so the finally clause can test it even when
    # setup_driver() itself raises.
    driver = None

    try:
        # Setup WebDriver once for all pages
        driver = setup_driver()
        for cat in news_cat:
            full_path = urljoin(BASE_URL, f'/news/{cat}')

            for page in range(start_page, stop_page):
                print(f"\nScraping page {page} for {cat.capitalize()}...")
                page_url = f"{full_path}?page={page}"
                articles_data = scrape_nst_articles(page_url, driver)
                if not articles_data:
                    # A single empty or failed page is skipped; later pages
                    # are still attempted.
                    print(f"No articles found on page {page} or error accessing page. Skipping page {page} for {cat}.")
                    continue
                all_articles.extend(articles_data)
                print(f"Successfully scraped {len(articles_data)} articles from page {page}")
                # Short pause between requests.
                time.sleep(0.5)

        save_to_csv(all_articles, filename)

        if all_articles:
            print(f"\nTotal articles scraped: {len(all_articles)}")
            print(f"Articles saved to: {filename}")
        else:
            print("\nNo articles were found.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        if driver is not None:
            print("Closing browser...")
            driver.quit()

if __name__ == "__main__":
    main()


Scraping page 2501 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=2501
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 2501

Scraping page 2502 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=2502
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 2502

Scraping page 2503 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=2503
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 2503

Scraping page 2504 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=2504
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 2504

Scraping page 2505 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=2505
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 2505

Scraping page 2506 for Nation...
Accessing U

In [None]:

# Configuration
BASE_URL = 'https://www.nst.com.my'

def setup_driver():
    """Create a headless Chrome WebDriver backed by the local ChromeDriver binary.

    The driver executable is looked up at ``chromedriver-linux64/chromedriver``
    under the current working directory.

    Returns:
        The ``webdriver.Chrome`` instance built from that service and the
        headless options below.
    """
    options = Options()
    for flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    chromedriver = os.path.join(os.getcwd(), 'chromedriver-linux64', 'chromedriver')
    return webdriver.Chrome(
        service=Service(executable_path=chromedriver),
        options=options,
    )


def _first_text(scope, class_name, default):
    """Return the stripped text of the first ``class_name`` element under ``scope``.

    Falls back to ``default`` when the lookup fails, so one missing field does
    not discard the whole article.
    """
    try:
        return scope.find_element(By.CLASS_NAME, class_name).text.strip()
    except Exception:
        return default


def scrape_nst_articles(url, driver):
    """Scrape the article teasers from one NST listing page.

    Args:
        url: Listing page URL to load.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        A list of dicts with the keys 'Section', 'Date', 'Headline' and
        'Summary'. The list is empty when the page fails to load or when no
        teaser yields a headline.
    """
    articles_data = []

    try:
        print(f"Accessing URL: {url}")
        driver.get(url)

        # Wait for articles to load
        print("Waiting for content to load...")
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "article-teaser"))
        )

        articles = driver.find_elements(By.CLASS_NAME, "article-teaser")
        print(f"Found {len(articles)} articles")

        for article in articles:
            try:
                headline = article.find_element(By.CLASS_NAME, "field-title").text.strip()

                # The summary is the teaser element nested inside this teaser.
                summary = _first_text(article, "article-teaser", "")

                # Section and date must be looked up inside THIS article.
                # Searching from the driver returns the first match on the
                # whole page, which would stamp every row with the first
                # article's metadata. If the article has no meta container,
                # fall back to searching the article element itself.
                try:
                    meta_scope = article.find_element(By.CLASS_NAME, "article-meta")
                except Exception:
                    meta_scope = article
                section = _first_text(meta_scope, "field-category", "Unknown")
                date = _first_text(meta_scope, "created-ago", "Unknown")

                if headline and date:
                    articles_data.append({
                        'Section': section,
                        'Date': date,
                        'Headline': headline,
                        'Summary': summary,
                    })

            except Exception:
                # Matched elements without a headline are not article entries;
                # skip them and keep going.
                continue

    except Exception as e:
        print(f"Error accessing website: {e}")

    return articles_data



def save_to_csv(articles_data, filename):
    """Write the scraped article records to ``filename`` as UTF-8 CSV.

    Args:
        articles_data: Sequence of dicts keyed by 'Section', 'Date',
            'Headline' and 'Summary'.
        filename: Destination path; an existing file is overwritten.

    When ``articles_data`` is empty a notice is printed and no file is written.
    """
    if not articles_data:
        print("No articles to save.")
        return

    columns = ['Section', 'Date', 'Headline', 'Summary']

    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for record in articles_data:
            out.writerow(record)
        print(f"Saved {len(articles_data)} articles to {filename}")

def main(start_page=1, stop_page=10, filename='nst_articles_nation_page_1_1300.csv'):
    """Scrape a range of NST 'nation' listing pages and save them to one CSV file.

    Args:
        start_page: First page number to scrape.
        stop_page: Page number to stop before (exclusive, as in ``range``).
        filename: Path of the CSV file to write. NOTE(review): the default
            name mentions pages 1-1300 while the default range is 1-9; pass a
            matching name to avoid overwriting a larger earlier export.

    Errors are reported on stdout instead of being raised, and the browser is
    closed on every exit path.
    """
    news_cat = ['nation']

    all_articles = []

    # Bound before the try block so the finally clause can test it even when
    # setup_driver() itself raises.
    driver = None

    try:
        # Setup WebDriver once for all pages
        driver = setup_driver()
        for cat in news_cat:
            full_path = urljoin(BASE_URL, f'/news/{cat}')

            for page in range(start_page, stop_page):
                print(f"\nScraping page {page} for {cat.capitalize()}...")
                page_url = f"{full_path}?page={page}"
                articles_data = scrape_nst_articles(page_url, driver)
                if not articles_data:
                    # A single empty or failed page is skipped; later pages
                    # are still attempted.
                    print(f"No articles found on page {page} or error accessing page. Skipping page {page} for {cat}.")
                    continue
                all_articles.extend(articles_data)
                print(f"Successfully scraped {len(articles_data)} articles from page {page}")
                # Short pause between requests.
                time.sleep(0.5)

        save_to_csv(all_articles, filename)

        if all_articles:
            print(f"\nTotal articles scraped: {len(all_articles)}")
            print(f"Articles saved to: {filename}")
        else:
            print("\nNo articles were found.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        if driver is not None:
            print("Closing browser...")
            driver.quit()

if __name__ == "__main__":
    main()


Scraping page 1 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=1
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 1

Scraping page 2 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=2
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 2

Scraping page 3 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=3
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 3

Scraping page 4 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=4
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 4

Scraping page 5 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=5
Waiting for content to load...
Found 40 articles
Successfully scraped 20 articles from page 5

Scraping page 6 for Nation...
Accessing URL: https://www.nst.com.my/news/nation?page=6
Wa

# Performance Comparison

In [None]:
import os, csv, time, psutil
from multiprocessing import Pool, cpu_count

In [None]:

BASE_URL = 'https://www.nst.com.my'
def setup_driver():
    """Create the headless Chrome WebDriver used by the benchmark scraper.

    Images and extensions are disabled. The driver executable is looked up at
    ``chromedriver-linux64/chromedriver`` under the current working directory.

    Returns:
        The ``webdriver.Chrome`` instance built from that service and options.
    """
    chrome_options = Options()
    chrome_flags = (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        # Both features go into ONE switch: when the same switch is passed
        # twice only the last value is kept, which silently dropped
        # AutomationControlled before.
        "--disable-blink-features=AutomationControlled,BlockCredentialedSubresources",
        "--blink-settings=imagesEnabled=false",
        "--disable-extensions",
    )
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver_path = os.path.join(os.getcwd(), 'chromedriver-linux64', 'chromedriver')
    service = Service(executable_path=driver_path)
    return webdriver.Chrome(service=service, options=chrome_options)
def scrape_nst_articles(url, driver):
    """Scrape the article teasers from one NST listing page (benchmark variant).

    Args:
        url: Listing page URL to load.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        A list of dicts with the keys 'Section', 'Date', 'Headline' and
        'Summary'. 'Summary' holds the full visible text of the teaser
        element. The list is empty when the page fails to load.
    """
    articles_data = []
    try:
        driver.get(url)
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "article-teaser"))
        )
        articles = driver.find_elements(By.CLASS_NAME, "article-teaser")
    except Exception as e:
        # Report the failed page instead of dropping it silently, so a low
        # record count in the benchmark can be traced back to its cause.
        print(f"Error accessing {url}: {e}")
        return articles_data

    for article in articles:
        try:
            headline = article.find_element(By.CLASS_NAME, "field-title").text.strip()
            section = article.find_element(By.CLASS_NAME, "field-category").text.strip()
            date = article.find_element(By.CLASS_NAME, "created-ago").text.strip()
            summary = article.text.strip()
        except Exception:
            # Matched elements lacking any of the three fields are skipped.
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # through, so the run can still be stopped.
            continue

        articles_data.append({
            'Section': section,
            'Date': date,
            'Headline': headline,
            'Summary': summary,
        })
    return articles_data

def save_to_csv(data, filename):
    """Write article records to ``filename`` as UTF-8 CSV; do nothing for empty data.

    Args:
        data: Sequence of dicts keyed by 'Section', 'Date', 'Headline' and
            'Summary'.
        filename: Destination path; an existing file is overwritten.
    """
    if not data:
        return

    columns = ['Section', 'Date', 'Headline', 'Summary']
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for record in data:
            out.writerow(record)

def main_basic():
    """Run the single-process scraper on a small page range and print its metrics.

    Scrapes pages 1-9 of the 'nation' category, writes the records to
    'nst_articles_basic.csv', then prints elapsed time, memory, record count
    and throughput.
    """
    news_cat = ['nation']
    all_articles = []
    start_time = time.time()
    process = psutil.Process(os.getpid())

    driver = setup_driver()
    try:
        for cat in news_cat:
            url_base = urljoin(BASE_URL, f'/news/{cat}')
            for page in range(1, 10):  # pages 1-9
                page_url = f"{url_base}?page={page}"
                articles = scrape_nst_articles(page_url, driver)
                all_articles.extend(articles)
                time.sleep(0.2)
    finally:
        # Always shut the browser down, even when scraping is interrupted or
        # raises, so no Chrome process is left running.
        driver.quit()

    filename = 'nst_articles_basic.csv'
    save_to_csv(all_articles, filename)

    end_time = time.time()
    print("Basic Scraper Results:")
    print(f"Total time: {end_time - start_time:.2f} seconds")
    # NOTE(review): this is the resident memory of the Python process only;
    # the browser presumably runs in separate processes that are not counted.
    print(f"Memory used: {process.memory_info().rss / 1024 ** 2:.2f} MB")
    print(f"Records processed: {len(all_articles)}")
    print(f"Throughput: {len(all_articles) / (end_time - start_time):.2f} records/sec")

if __name__ == "__main__":
    main_basic()




Basic Scraper Results:
Total time: 138.16 seconds
Memory used: 215.01 MB
Records processed: 180
Throughput: 1.30 records/sec


In [None]:
import multiprocessing
print(multiprocessing.cpu_count())

2


In [None]:
import pandas as pd

# Columns that together identify one scraped article record.
key_columns = ['Section', 'Date', 'Headline', 'Summary']

# Read the scraped articles back from disk.
df = pd.read_csv('nst_articles_nation_page_1_1300.csv')

# keep=False flags every member of a duplicate group, not only the repeats.
is_duplicate = df.duplicated(subset=key_columns, keep=False)
duplicates = df.loc[is_duplicate]

# Show the duplicated rows and how many there are.
print("Duplicate entries:")
print(duplicates)

duplicate_count = len(duplicates)
print(f"\nTotal number of duplicate rows: {duplicate_count}")

# Keep the first occurrence of each record and report the remaining row count.
df_no_duplicates = df.drop_duplicates(subset=key_columns)

final_count = len(df_no_duplicates)
print(f"\nTotal number of rows after dropping duplicates: {final_count}")


Duplicate entries:
      Section                    Date  \
742    NATION  Apr 16, 2025 @ 11:00am   
743    NATION  Apr 16, 2025 @ 11:00am   
1233   NATION   Apr 10, 2025 @ 6:59am   
1235   NATION   Apr 10, 2025 @ 6:59am   
1409   NATION   Apr 7, 2025 @ 10:57am   
1411   NATION   Apr 7, 2025 @ 10:57am   
6424   NATION   Jan 30, 2025 @ 1:01am   
6425   NATION   Jan 30, 2025 @ 1:01am   
10754  NATION   Nov 28, 2024 @ 1:24am   
10755  NATION   Nov 28, 2024 @ 1:24am   
10940  NATION   Nov 25, 2024 @ 8:33am   
10942  NATION   Nov 25, 2024 @ 8:33am   
14642  NATION    Oct 7, 2024 @ 8:14am   
14649  NATION    Oct 7, 2024 @ 8:14am   
18056  NATION   Aug 14, 2024 @ 7:45am   
18058  NATION   Aug 14, 2024 @ 7:45am   
18151  NATION   Aug 12, 2024 @ 3:09pm   
18154  NATION   Aug 12, 2024 @ 3:09pm   
19926  NATION   Jul 15, 2024 @ 4:30am   
19928  NATION   Jul 15, 2024 @ 4:30am   
21501  NATION   Jun 18, 2024 @ 6:19am   
21502  NATION   Jun 18, 2024 @ 6:19am   
21829  NATION   Jun 12, 2024 @ 8:03am 