# NST News Scraper Notebook
This notebook contains code for scraping news articles from the New Straits Times (NST) website.

## 1. Installation Requirements

In [None]:
# Install required packages
!pip install selenium
!pip install webdriver-manager
!pip install beautifulsoup4

## 2. Import Libraries

In [None]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from datetime import datetime
import time
import csv
import os
import sys

## 3. WebDriver Setup Function

In [None]:
def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The ChromeDriver binary is resolved through ``ChromeDriverManager`` and
    sanity-checked before the browser is started. The browser is then pointed
    at ``about:blank`` as a smoke test.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.

    Raises:
        Exception: If the resolved driver path does not exist or does not look
            like a runnable driver binary. Any error raised while starting
            Chrome is re-raised after diagnostics are printed.
    """
    print("Setting up Chrome WebDriver...")
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.7049.96 Safari/537.36")

        # Get ChromeDriver with correct architecture
        driver_path = ChromeDriverManager().install()
        print(f"ChromeDriver path: {driver_path}")

        if not os.path.exists(driver_path):
            raise Exception(f"ChromeDriver not found at {driver_path}")

        # Verify the ChromeDriver file. Only Windows binaries carry an ".exe"
        # suffix; on other platforms the driver has no extension, so check
        # that the file is executable instead of rejecting it outright.
        if sys.platform.startswith('win'):
            is_valid_driver = driver_path.endswith('.exe')
        else:
            is_valid_driver = os.access(driver_path, os.X_OK)
        if not is_valid_driver:
            raise Exception(f"Invalid ChromeDriver file: {driver_path}")

        service = Service(executable_path=driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Test the browser connection
        driver.get("about:blank")
        print("Chrome WebDriver setup successful!")
        return driver
    except Exception as e:
        print(f"Error setting up Chrome WebDriver: {str(e)}")
        print(f"Python version: {sys.version}")
        print(f"Selenium version: {webdriver.__version__}")
        print(f"Chrome version: 135.0.7049.96")
        print(f"System architecture: {sys.platform}")
        # If the browser was started before the failure, shut it down so the
        # Chrome process is not left running.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"Error closing browser after failed setup: {str(quit_error)}")
        raise

## 4. Article Scraping Function

In [None]:
# (tag, css_class) pairs tried in order; a class of None means "any <tag>".
_ARTICLE_SELECTORS = (
    ("div", "article-teaser"),
    ("div", "article-item"),
    ("article", None),
    ("div", "list-article"),
)
_HEADLINE_SELECTORS = (
    ("h2", "article-title"),
    ("h6", None),
    ("h2", None),
    ("h3", None),
    ("a", "article-link"),
)
_DATE_SELECTORS = (
    ("time", "article-date"),
    ("span", "created-date"),
    ("div", "article-date"),
    ("span", "date"),
    ("div", "field-item"),
    ("div", "article-meta"),
)
_SUMMARY_SELECTORS = (
    ("div", "article-teaser"),
    ("div", "field-item"),
    ("p", "article-summary"),
    ("div", "summary"),
    ("p", None),
)

# Section used when none can be parsed out of the teaser text.
_DEFAULT_SECTION = "Crime & Courts"
# Keywords that mark the text in front of '@' as "<section><date>".
_SECTION_KEYWORDS = ('politics', 'crime', 'nation', 'business', 'sports')
# Month abbreviations used to find where the date starts in "<section><date>".
_MONTH_ABBREVIATIONS = (
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
)
# Any 20xx year marks a text as date-like.
_YEAR_TOKENS = tuple(str(year) for year in range(2000, 2100))
# Texts containing these phrases are navigation links, not summaries.
_SKIP_PHRASES = ('read more', 'click here')


def _find_elements(node, tag, class_name):
    """Return all ``tag`` descendants of ``node``, filtered by class if given."""
    if class_name:
        return node.find_all(tag, class_=class_name)
    return node.find_all(tag)


def _find_articles(soup):
    """Return the article containers matched by the first selector that hits."""
    articles = []
    for tag, class_name in _ARTICLE_SELECTORS:
        articles = _find_elements(soup, tag, class_name)
        print(f"Found {len(articles)} articles with selector {tag}.{class_name if class_name else ''}")
        if articles:
            break
    return articles


def _extract_headline(article):
    """Return the first candidate text longer than 20 characters, or ''."""
    for tag, class_name in _HEADLINE_SELECTORS:
        for element in _find_elements(article, tag, class_name):
            text = element.get_text(strip=True)
            if len(text) > 20:  # Likely a headline if longer than 20 chars
                return text
    return ""


def _split_section_and_date(date_text):
    """Split a '<section><date> @ <time>' string into (section, date)."""
    parts = date_text.split('@')
    if len(parts) < 2:
        return _DEFAULT_SECTION, date_text

    section_part = parts[0].strip()
    if not any(keyword in section_part.lower() for keyword in _SECTION_KEYWORDS):
        return _DEFAULT_SECTION, date_text

    # The section name and the date are fused together; the date starts at
    # the first month abbreviation. With no month found, the whole prefix is
    # treated as the section.
    month_positions = [
        position
        for position in (section_part.find(month) for month in _MONTH_ABBREVIATIONS)
        if position != -1
    ]
    cut = min(month_positions) if month_positions else len(section_part)
    section = section_part[:cut].strip()
    date = section_part[cut:].strip() + ' @' + parts[1].strip()
    return section, date


def _extract_section_and_date(article):
    """Return (section, date) from the first element whose text has a year."""
    for tag, class_name in _DATE_SELECTORS:
        for element in _find_elements(article, tag, class_name):
            date_text = element.get_text(strip=True)
            if any(year in date_text for year in _YEAR_TOKENS):
                return _split_section_and_date(date_text)
    return _DEFAULT_SECTION, ""


def _is_summary_candidate(text):
    """Return True for text over 50 chars that is not a 'read more' style link."""
    lowered = text.lower()
    return len(text) > 50 and not any(phrase in lowered for phrase in _SKIP_PHRASES)


def _extract_summary(article):
    """Return the article summary, preferring dateline-style text, or ''."""
    for tag, class_name in _SUMMARY_SELECTORS:
        for element in _find_elements(article, tag, class_name):
            text = element.get_text(strip=True)
            # A colon within the first 50 characters indicates a dateline
            # such as "GEORGE TOWN:" or "IPOH:".
            if _is_summary_candidate(text) and ':' in text[:50]:
                return text

    # If no summary found, fall back to the first paragraph that looks like one
    for paragraph in article.find_all('p'):
        text = paragraph.get_text(strip=True)
        if _is_summary_candidate(text):
            return text
    return ""


def scrape_nst_articles(base_url, start_page, end_page):
    """Scrape article teasers from a range of NST listing pages.

    Each page ``{base_url}?page={n}`` is loaded in a headless browser, parsed
    with BeautifulSoup, and every teaser with a headline is turned into a
    record. Errors on a single page or article are printed and skipped so the
    rest of the run continues. The browser is always closed before returning.

    Args:
        base_url: Listing URL without the ``page`` query parameter.
        start_page: First page number to scrape (inclusive).
        end_page: Last page number to scrape (inclusive).

    Returns:
        A list of dicts with the keys 'Section', 'Date', 'Headline' and
        'Summary'. 'Date' and 'Summary' are empty strings when not found.

    Raises:
        Exception: Whatever ``setup_driver`` raises if the browser cannot be
            started.
    """
    driver = setup_driver()
    all_articles_data = []

    try:
        for page in range(start_page, end_page + 1):
            try:
                # Construct URL for each page
                url = f"{base_url}?page={page}"
                print(f"\nScraping page {page}...")

                print(f"Accessing URL: {url}")
                driver.get(url)

                # Wait for the content to load
                print("Waiting for content to load...")
                time.sleep(1)  # Initial wait

                # Wait for article elements to be present; a timeout is only
                # a warning because the fallback selectors may still match.
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "article-teaser"))
                    )
                except Exception as e:
                    print(f"Warning: Timeout waiting for articles to load: {str(e)}")

                # Parse the page source after JavaScript has rendered
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                for i, article in enumerate(_find_articles(soup), 1):
                    try:
                        print(f"\nProcessing article {i} on page {page}:")

                        headline = _extract_headline(article)
                        if not headline:
                            print("Skipping empty article")
                            continue
                        print(f"Headline: {headline}")

                        section, date = _extract_section_and_date(article)
                        print(f"Section: {section}")
                        print(f"Date: {date}")

                        summary = _extract_summary(article)
                        print(f"Summary: {summary[:100]}...")

                        all_articles_data.append({
                            'Section': section,
                            'Date': date,
                            'Headline': headline,
                            'Summary': summary
                        })
                        print(f"Added article: {headline}")

                    except Exception as e:
                        print(f"Error processing article {i} on page {page}: {str(e)}")

                # Add a small delay between pages to avoid overwhelming the server
                if page < end_page:
                    time.sleep(2)

            except Exception as e:
                print(f"Error accessing website for page {page}: {str(e)}")

    finally:
        print("Closing browser...")
        driver.quit()

    return all_articles_data

## 5. Data Saving Function

In [None]:
def save_to_csv(articles_data, filename):
    """Write article records to a CSV file.

    Nothing is written when ``articles_data`` is empty; a message is printed
    instead. Otherwise the file is created (or overwritten) as UTF-8 with a
    byte-order mark, containing a header row followed by one row per record.

    Args:
        articles_data: Sequence of dicts keyed by 'Section', 'Date',
            'Headline' and 'Summary'.
        filename: Path of the CSV file to write.
    """
    if articles_data:
        columns = ['Section', 'Date', 'Headline', 'Summary']
        with open(filename, 'w', newline='', encoding='utf-8-sig') as out_file:
            csv_writer = csv.DictWriter(out_file, fieldnames=columns)
            csv_writer.writeheader()
            csv_writer.writerows(articles_data)
            print(f"Saved {len(articles_data)} articles to {filename}")
    else:
        print("No articles to save.")

## 6. Main Execution

In [None]:
# Listing URL of the NST "Crime & Courts" section
base_url = 'https://www.nst.com.my/news/crime-courts'

# Inclusive range of listing pages to scrape; adjust both bounds as needed
start_page, end_page = 1201, 1300

try:
    # Collect the article records from every page in the range
    articles_data = scrape_nst_articles(base_url, start_page, end_page)

    # Timestamped output name, so repeated runs do not overwrite each other
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'nst_articles_{timestamp}.csv'

    # Persist the records (save_to_csv itself skips empty results)
    save_to_csv(articles_data, filename)

    # Echo what was scraped, truncating each summary to 100 characters
    if not articles_data:
        print("\nNo articles were found.")
    else:
        print(f"\nScraped {len(articles_data)} articles from pages {start_page} to {end_page}:")
        for article in articles_data:
            print(f"\nSection: {article['Section']}")
            print(f"Date: {article['Date']}")
            print(f"Headline: {article['Headline']}")
            print(f"Summary: {article['Summary'][:100]}...")

except Exception as e:
    # Top-level boundary: report the failure instead of showing a traceback
    print(f"An error occurred: {e}")

## Usage Instructions

1. Run the installation block first to install required packages
2. Run the import block to import necessary libraries
3. Run the function definition blocks (WebDriver Setup, Article Scraping, and Data Saving)
4. Finally, run the main execution block to start the scraping process

The script will:
- Scrape articles from the specified page range
- Save the results to a CSV file with timestamp
- Print progress and results to the console

Note: Make sure the Chrome browser is installed on your system before running the script.