In [57]:
!pip install pandas
!pip install requests
!pip install beautifulsoup4
!pip install lxml
!pip install html5lib
!pip install selenium webdriver-manager

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [58]:
import pandas as pd
import requests
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

import time
import csv
from datetime import datetime

In [59]:
# Configure logging
logging.basicConfig(
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler('straits_times_scraper_debug.log'),
        logging.StreamHandler()
    ]
)

In [60]:
def setup_webdriver():
    """
    Set up Selenium WebDriver with Chrome and comprehensive error handling
    
    Returns:
        webdriver: Configured Chrome WebDriver
    """
    try:
        # Chrome options
        chrome_options = Options()
        
        # Remove headless option as requested
        # chrome_options.add_argument('--headless')  # Commented out
        
        # Additional options to improve stability
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--allow-running-insecure-content')
        
        # User agent to mimic browser
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        
        # Attempt to install and configure WebDriver
        service = Service(ChromeDriverManager().install())
        
        # Create WebDriver with extensive error handling
        driver = webdriver.Chrome(service=service, options=chrome_options)
        
        # Additional configurations
        driver.set_page_load_timeout(50)  # 30 seconds page load timeout
        
        logging.info("WebDriver successfully initialized")
        return driver
    
    except Exception as e:
        logging.error(f"Failed to initialize WebDriver: {e}")
        raise

In [61]:
def scrape_cnn_search_results(driver, url):
    """
    Scrape article details from CNN search results page using Selenium
    
    Args:
        driver (webdriver): Selenium WebDriver
        url (str): URL of CNN search results page
    
    Returns:
        list: List of dictionaries containing article details
    """
    try:
        # Navigate to the URL
        driver.get(url)
        
        # Wait for search results to load
        WebDriverWait(driver, 50).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.container__link'))
        )
        
        # Small delay to ensure page is fully loaded
        time.sleep(2)
        
        # Find all article links
        article_links = driver.find_elements(By.CSS_SELECTOR, '.container__link')
        
        # Extract article details
        articles = []
        for link in article_links:
            try:
                # Extract title
                title_elem = link.find_element(By.CSS_SELECTOR, '.container__headline-text')
                title = title_elem.text.strip()
                
                # Extract URL
                article_url = link.get_attribute('href')
                
                # Optional: Extract date if available
                try:
                    date_elem = link.find_element(By.CSS_SELECTOR, '.container__date')
                    article_date = date_elem.text.strip()
                except:
                    article_date = 'N/A'
                
                # Optional: Extract summary if available
                try:
                    summary_elem = link.find_element(By.CSS_SELECTOR, '.container__description')
                    summary = summary_elem.text.strip()
                except:
                    summary = 'N/A'
                
                # Create article dictionary
                article = {
                    'Title': title,
                    'URL': article_url,
                    'Date': article_date,
                    'Summary': summary
                }
                
                articles.append(article)
            
            except Exception as elem_error:
                logging.warning(f"Error extracting individual article: {elem_error}")
        
        # Generate a timestamped filename
        output_filename = f'cnn_articles_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        
        # Save to CSV
        articles_df = pd.DataFrame(articles)
        articles_df.to_csv(output_filename, index=False, encoding='utf-8')
        
        logging.info(f"Saved {len(articles)} articles to {output_filename}")
        
        return articles
    
    except Exception as error:
        logging.error(f"Error scraping URL {url}: {error}")
        return []

In [62]:
def main():
    """
    Main function to process URLs and scrape search results
    """
    # Set up WebDriver
    driver = setup_webdriver()
    
    try:
        # Read URLs from CSV
        urls_df = pd.read_csv('CNN_Organised_Crime_URLs.csv')
        
        # Will store articles from all URLs
        all_articles = []
        
        # Iterate through URLs
        for index, row in urls_df.iterrows():
            url = row['URL']
            logging.info(f"Scraping URL: {url}")
            
            # Scrape and collect articles
            articles = scrape_cnn_search_results(driver, url)
            all_articles.extend(articles)
            
            # Be nice to the server - add a delay
            time.sleep(2)
        
        # Create a consolidated CSV of all articles
        if all_articles:
            consolidated_df = pd.DataFrame(all_articles)
            consolidated_filename = f'cnn_all_articles_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
            consolidated_df.to_csv(consolidated_filename, index=False, encoding='utf-8')
            logging.info(f"Saved total {len(all_articles)} articles to {consolidated_filename}")
    
    except FileNotFoundError:
        logging.error("Error: CNN_Organised_Crime_URLs.csv file not found.")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
    finally:
        # Always close the driver
        driver.quit()

In [63]:
if __name__ == '__main__':
    main()

2025-03-04 17:36:48,242 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-04 17:36:48,469 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-04 17:36:48,622 - INFO: Driver [C:\Users\Bertrand Tan\.wdm\drivers\chromedriver\win64\133.0.6943.141\chromedriver.exe] found in cache
2025-03-04 17:36:50,311 - INFO: WebDriver successfully initialized
2025-03-04 17:36:50,329 - INFO: Scraping URL: https://edition.cnn.com/search?q=%22Organised+Crime%22%2C+%22Drug+Trafficking%22&from=0&size=100&page=1&sort=relevance&types=article&section=
  (Session info: chrome=133.0.6943.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00F10B43+25139]
	(No symbol) [0x00EA13F4]
	(No symbol) [0x00D804E3]
	(No symbol) [0x00DC83D7]
	(No symbol) [0x00DC872B]
	(No symbol) [0x00DBDA81]
	(No symbol) [0x00DED014]
	(No symbol) [0x00DBD9A4]
	(No symbol) [0x00DED