# Web Scraping


In [None]:
# Install necessary Python packages
!pip install selenium webdriver_manager chromedriver_autoinstaller

# Install required Linux packages
!apt-get update -y
!apt-get install -y wget unzip

# Download Chrome and ChromeDriver for headless use in Colab
!wget -q https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.96/linux64/chrome-linux64.zip
!wget -q https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.96/linux64/chromedriver-linux64.zip

# Extract the downloaded zip files
!unzip chrome-linux64.zip
!unzip chromedriver-linux64.zip


Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting chromedriver_autoinstaller
  Downloading chromedriver_autoinstaller-0.6.4-py3-none-any.whl.metadata (2.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
# Print the runtime's kernel and CPU architecture; the Chrome and ChromeDriver
# archives downloaded in the install cell are the linux64 builds.
!uname -a

Linux 1d2e6ff9745e 6.1.123+ #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux


In [None]:
# Set the system PATH to include Chrome and ChromeDriver binaries
import os

# Append each directory only when it is not already listed, so re-running this
# cell does not keep growing PATH with duplicate entries. A missing PATH
# variable is treated as empty instead of raising KeyError.
_current_path = os.environ.get('PATH', '')
_path_entries = _current_path.split(os.pathsep) if _current_path else []
for _tool_dir in ('chrome-linux64', 'chromedriver-linux64'):
    _tool_path = os.path.join(os.getcwd(), _tool_dir)
    if _tool_path not in _path_entries:
        _path_entries.append(_tool_path)
os.environ['PATH'] = os.pathsep.join(_path_entries)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime

from urllib.parse import urljoin

import time
import csv
import os
import sys


In [None]:
import os
import csv
import time
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configuration
# Site root; main() joins it with TARGET_PATH to build each listing-page URL.
BASE_URL = 'https://www.nst.com.my'
TARGET_PATH = '/sports/others'  # path of the section listing to scrape (Sports > Others)

def setup_driver():
    """Create a headless Chrome WebDriver backed by the local ChromeDriver.

    The ChromeDriver executable is expected at
    ``<current working directory>/chromedriver-linux64/chromedriver``.

    Returns:
        A ``webdriver.Chrome`` instance started with the headless,
        no-sandbox and disable-dev-shm-usage flags.
    """
    options = Options()
    # Flags are applied in the same order as they are listed here.
    for flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    chromedriver_binary = os.path.join(os.getcwd(), 'chromedriver-linux64', 'chromedriver')
    chromedriver_service = Service(executable_path=chromedriver_binary)
    return webdriver.Chrome(service=chromedriver_service, options=options)


def _element_text(parent, class_name, default):
    """Return the stripped text of the first element under ``parent`` with
    ``class_name``, or ``default`` when the lookup or text read fails."""
    try:
        return parent.find_element(By.CLASS_NAME, class_name).text.strip()
    except Exception:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the scrape instead of being mistaken for a missing element.
        return default


def _brief_error(exc):
    """Return ``"<ExceptionType>: <first line of the message>"`` for logging."""
    message_lines = str(exc).splitlines()
    first_line = message_lines[0].strip() if message_lines else ""
    return f"{type(exc).__name__}: {first_line}"


def scrape_nst_articles(url, driver, timeout=5):
    """Scrape the article teasers from one listing page.

    Args:
        url: Address of the listing page to load.
        driver: WebDriver used to load the page and locate elements.
        timeout: Seconds to wait for at least one ``article-teaser`` element
            to be present before giving up on the page.

    Returns:
        A list of dicts with the keys ``Section``, ``Date``, ``Headline`` and
        ``Summary``, one per teaser whose headline could be read. The list is
        empty when the page could not be loaded or the wait timed out; such
        failures are printed, not raised.
    """
    articles_data = []

    try:
        print(f"Accessing URL: {url}")
        driver.get(url)

        # Wait for articles to load
        print("Waiting for content to load...")
        WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "article-teaser"))
        )

        articles = driver.find_elements(By.CLASS_NAME, "article-teaser")
        print(f"Found {len(articles)} articles on this page")

        for i, article in enumerate(articles, 1):
            try:
                # A teaser without a headline is reported and skipped.
                headline = article.find_element(By.CLASS_NAME, "field-title").text.strip()

                # NOTE(review): this looks for an "article-teaser" element
                # *inside* an element that was itself matched by
                # "article-teaser", so the summary is probably always "" --
                # confirm the right selector against the page markup.
                summary = _element_text(article, "article-teaser", "")

                # Section and date both live under the article-meta element;
                # if that is missing, both fall back to "Unknown".
                try:
                    meta_div = article.find_element(By.CLASS_NAME, "article-meta")
                except Exception:
                    section = date = "Unknown"
                else:
                    section = _element_text(meta_div, "field-category", "Unknown")
                    date = _element_text(meta_div, "created-ago", "Unknown")

                if headline and date:
                    articles_data.append({
                        'Section': section,
                        'Date': date,
                        'Headline': headline,
                        'Summary': summary,
                    })

            except Exception as e:
                # Log only the exception type and first message line: the full
                # text of a WebDriver error carries a multi-line driver stack
                # trace that floods the cell output.
                print(f"Error processing article {i}: {_brief_error(e)}")

    except Exception as e:
        print(f"Error accessing website: {_brief_error(e)}")

    return articles_data



def save_to_csv(articles_data, filename):
    """Write the scraped article records to ``filename`` as a UTF-8 CSV file.

    A header row is written first, followed by one row per record, with the
    columns in the order Section, Date, Headline, Summary. When
    ``articles_data`` is empty no file is created and a notice is printed
    instead.
    """
    if articles_data:
        columns = ['Section', 'Date', 'Headline', 'Summary']
        with open(filename, 'w', newline='', encoding='utf-8') as out_file:
            csv_writer = csv.DictWriter(out_file, fieldnames=columns)
            csv_writer.writeheader()
            csv_writer.writerows(articles_data)
            row_count = len(articles_data)
            print(f"Saved {row_count} articles to {filename}")
    else:
        print("No articles to save.")

def main(start_page=1, target_article_count=6000, max_pages=8000):
    """Scrape the configured section page by page and save the result as CSV.

    Pages ``start_page`` up to (but not including) ``max_pages`` are requested
    as ``<BASE_URL + TARGET_PATH>?page=<n>``. Scraping stops early when a page
    yields no articles or once at least ``target_article_count`` articles have
    been collected. The collected rows are passed to ``save_to_csv`` under a
    filename that embeds the article count.

    Args:
        start_page: First page number to request.
        target_article_count: Stop once this many articles are collected.
        max_pages: Exclusive upper bound on the page number.

    Unexpected errors are printed rather than raised, and the browser is
    closed in every case where it was started.
    """
    all_articles = []
    # Bound before the try block so the finally clause can always test it;
    # otherwise a failure inside setup_driver() would surface as an
    # UnboundLocalError that hides the real error.
    driver = None

    try:
        # Setup WebDriver once for all pages
        driver = setup_driver()
        full_path = urljoin(BASE_URL, TARGET_PATH)

        for page in range(start_page, max_pages):
            print(f"\nScraping page {page} for Sports (Others)...")
            page_url = f"{full_path}?page={page}"
            articles_data = scrape_nst_articles(page_url, driver)
            if not articles_data:
                # An empty result means either the end of the listing or a
                # page that failed to load; both end the pagination.
                print(f"No articles found on page {page} or error accessing page. Stopping pagination.")
                break
            all_articles.extend(articles_data)
            print(f"Successfully scraped {len(articles_data)} articles from page {page}. Total: {len(all_articles)}")
            time.sleep(0.5)  # short pause between page requests

            if len(all_articles) >= target_article_count:
                print(f"\nReached the target of {target_article_count} articles. Stopping scraping.")
                break

        filename = f'nst_articles_sports_others_approx_{len(all_articles)}_articles.csv'
        save_to_csv(all_articles, filename)

        if all_articles:
            print(f"\nTotal articles scraped: {len(all_articles)}")
            print(f"Articles saved to: {filename}")
        else:
            print("\nNo articles were found for Sports (Others) section.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        if driver is not None:
            print("Closing browser...")
            driver.quit()

# Entry-point guard: start the scrape only when this code runs as the main
# program rather than being imported as a module.
if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
#9 0x5744b3053433 <unknown>
#10 0x5744b301fea3 <unknown>
#11 0x5744b3020b01 <unknown>
#12 0x5744b34f0b6b <unknown>
#13 0x5744b34f4a51 <unknown>
#14 0x5744b34d7c62 <unknown>
#15 0x5744b34f55c4 <unknown>
#16 0x5744b34bbf1f <unknown>
#17 0x5744b3519dc8 <unknown>
#18 0x5744b3519fa6 <unknown>
#19 0x5744b352ab66 <unknown>
#20 0x7fde601faac3 <unknown>

Error processing article 8: Message: no such element: Unable to locate element: {"method":"css selector","selector":".field-title"}
  (Session info: chrome=135.0.7049.96); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
#0 0x5744b352bd1a <unknown>
#1 0x5744b2fdc5f0 <unknown>
#2 0x5744b302da33 <unknown>
#3 0x5744b302dc21 <unknown>
#4 0x5744b30215c6 <unknown>
#5 0x5744b305368d <unknown>
#6 0x5744b30214ba <unknown>
#7 0x5744b305382e <unknown>
#8 0x5744b3079660 <unknown>
#9 0x5