In [85]:
!pip install pandas
!pip install requests
!pip install beautifulsoup4
!pip install lxml
!pip install html5lib
!pip install selenium webdriver-manager

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [86]:
import pandas as pd
import requests
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

import time
import csv
from datetime import datetime

In [87]:
# Logging setup: every record goes both to a debug file on disk and to the
# notebook/console stream, using one shared timestamped format.
_log_handlers = [
    logging.FileHandler('cna_scraper_debug.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

In [88]:
def setup_webdriver():
    """
    Create and configure a Chrome WebDriver for scraping.

    The chromedriver binary is resolved through webdriver-manager, Chrome is
    started with the options below, and a 60-second page-load timeout is set.

    Returns:
        webdriver: Configured Chrome WebDriver. The caller owns it and must
        call ``quit()`` when done.

    Raises:
        Exception: Any error raised while installing the driver, launching
        Chrome, or configuring the session is logged and re-raised. If Chrome
        had already been started, it is shut down first so no browser process
        is left behind.
    """
    driver = None
    try:
        chrome_options = Options()

        # Process/rendering flags commonly needed in containers and CI.
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')

        # NOTE(review): these two flags relax TLS / mixed-content checks for
        # every page the browser loads; confirm they are really required.
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--allow-running-insecure-content')

        # Reduce the signals that reveal an automated browser.
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])

        # Present a regular desktop Chrome user agent.
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

        # Download (or reuse the cached) chromedriver matching the local Chrome.
        service = Service(ChromeDriverManager().install())

        driver = webdriver.Chrome(service=service, options=chrome_options)

        driver.set_page_load_timeout(60)

        # NOTE(review): execute_script runs in the document that is currently
        # loaded, so this override presumably does not survive the next
        # navigation -- confirm whether it has any effect on scraped pages.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', { get: () => undefined })")

        logging.info("WebDriver successfully initialized")
        return driver

    except Exception as e:
        logging.error(f"Failed to initialize WebDriver: {e}")
        # If Chrome was already launched, close it before propagating the
        # error; otherwise the browser process would be orphaned.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                logging.warning(f"Failed to close WebDriver after setup error: {quit_error}")
        raise

In [89]:
def _first_nonempty_text(element, selectors, default='N/A'):
    """Return the stripped text of the first selector match that has text, else `default`."""
    for selector in selectors:
        try:
            text = element.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            # No match for this selector (or the lookup failed); try the next.
            continue
        if text:
            return text
    return default


def _extract_title_and_url(element, selectors):
    """Return (title, url) for a result element; title is '' when no selector yields text."""
    fallback_title = ''
    for selector in selectors:
        try:
            anchor = element.find_element(By.CSS_SELECTOR, selector)
            title = anchor.text.strip()
            href = anchor.get_attribute('href')
        except Exception:
            continue
        if title and href:
            return title, href
        # Remember the first title seen without a link so that a later, empty
        # match cannot overwrite it.
        if title and not fallback_title:
            fallback_title = title
    return fallback_title, 'N/A'


def scrape_cna_search_results(driver, url):
    """
    Scrape article details from a CNA search results page using Selenium.

    The page is loaded, the function waits (up to 60 s) for
    ``document.readyState`` to become ``'complete'``, then tries a list of CSS
    selectors until one returns result elements. For every result it extracts
    title, link, date and description, trying several selectors for each
    field. Results without a title are skipped. When at least one article is
    extracted, the articles are also written to a timestamped
    ``cna_articles_<YYYYmmdd_HHMMSS>.csv`` file in the working directory.

    Debug screenshots are saved when the page-load wait fails, when no result
    elements are found, or when an unexpected error occurs.

    Args:
        driver (webdriver): Selenium WebDriver
        url (str): URL of CNA search results page

    Returns:
        list: List of dictionaries with the keys 'Title', 'URL', 'Date' and
        'Description'. Missing URL/date/description values are 'N/A'. An
        empty list is returned when nothing is found or an error occurs;
        errors are logged, not raised.
    """
    # Selector lists are loop-invariant, so they are built once per call.
    result_selectors = [
        '.list-object__heading',
        '.ais-list-object',
        '.search-results-list .item',
        'div[data-testid="search-result-item"]'
    ]
    title_selectors = [
        '.a-list-object__heading-link',
        '.list-object__heading-link',
        'a.title',
        'h3 a',
        '.search-result-title'
    ]
    date_selectors = [
        '.hit-date',
        # These two used to be a single entry: a missing comma made Python
        # concatenate the adjacent literals into one selector.
        '.list-object__timestamp',
        '.timestamp',
        '.search-result-date',
        '.article-date'
    ]
    desc_selectors = [
        '.hit-description',
        '.search-result-description',
        '.article-excerpt',
        'p.description'
    ]

    try:
        logging.info(f"Attempting to navigate to URL: {url}")
        driver.get(url)

        # Wait for the document to finish loading; on failure keep going with
        # whatever has rendered, but leave a screenshot for debugging.
        try:
            WebDriverWait(driver, 60).until(
                lambda d: d.execute_script('return document.readyState') == 'complete'
            )
            logging.info("Page fully loaded")
        except Exception as wait_error:
            logging.error(f"Page load wait failed: {wait_error}")
            driver.save_screenshot(f'debug_screenshot_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png')

        # Use the first container selector that matches anything.
        article_links = []
        for selector in result_selectors:
            try:
                article_links = driver.find_elements(By.CSS_SELECTOR, selector)
            except Exception as sel_error:
                logging.warning(f"Selector {selector} failed: {sel_error}")
                continue
            if article_links:
                logging.info(f"Found {len(article_links)} articles using selector: {selector}")
                break

        if not article_links:
            logging.error("No articles found using any selectors")
            driver.save_screenshot(f'debug_no_articles_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png')
            return []

        articles = []
        for link in article_links:
            title, article_url = _extract_title_and_url(link, title_selectors)

            # Skip results without a usable title.
            if not title:
                continue

            articles.append({
                'Title': title,
                'URL': article_url,
                'Date': _first_nonempty_text(link, date_selectors),
                'Description': _first_nonempty_text(link, desc_selectors)
            })

        if articles:
            output_filename = f'cna_articles_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'

            # Every collected title is already non-empty, so no extra
            # filtering is needed before writing.
            articles_df = pd.DataFrame(articles)
            articles_df.to_csv(output_filename, index=False, encoding='utf-8')

            logging.info(f"Saved {len(articles_df)} articles to {output_filename}")
        else:
            logging.warning("No articles could be extracted")

        return articles

    except Exception as error:
        logging.error(f"Critical error scraping URL {url}: {error}")
        # Best-effort screenshot; the session itself may be unusable here.
        try:
            driver.save_screenshot(f'debug_critical_error_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png')
        except Exception as shot_error:
            logging.warning(f"Could not save debug screenshot: {shot_error}")
        return []

In [90]:
def main():
    """
    Scrape every search-results URL listed in 'CNA_Organised_Crime_URLs.csv'.

    The CSV must have a 'URL' column. Rows whose URL cell is empty are
    skipped. Each URL is passed to ``scrape_cna_search_results``, with a
    2-second pause between requests. All collected articles are written to a
    timestamped ``cna_all_articles_<YYYYmmdd_HHMMSS>.csv`` file.

    Input problems (missing file, missing column, unreadable CSV) and errors
    during scraping are logged rather than raised. An exception from
    ``setup_webdriver`` propagates to the caller.
    """
    # Load the URL list before launching Chrome so that a missing or
    # malformed input file does not start a browser for nothing.
    try:
        urls_df = pd.read_csv('CNA_Organised_Crime_URLs.csv')
        # Blank cells are read as NaN; drop them instead of handing them to
        # the browser.
        urls = [str(value).strip() for value in urls_df['URL'].dropna()]
        urls = [value for value in urls if value]
    except FileNotFoundError:
        logging.error("Error: CNA_Organised_Crime_URLs.csv file not found.")
        return
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        return

    if not urls:
        logging.warning("No URLs to scrape in CNA_Organised_Crime_URLs.csv")
        return

    driver = setup_webdriver()

    try:
        # Articles accumulated across all URLs
        all_articles = []

        for position, url in enumerate(urls):
            # Be nice to the server - pause between requests (not after the
            # last one, where it would only delay shutdown).
            if position:
                time.sleep(2)

            logging.info(f"Scraping URL: {url}")
            all_articles.extend(scrape_cna_search_results(driver, url))

        # Create a consolidated CSV of all articles
        if all_articles:
            consolidated_df = pd.DataFrame(all_articles)
            consolidated_filename = f'cna_all_articles_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
            consolidated_df.to_csv(consolidated_filename, index=False, encoding='utf-8')
            logging.info(f"Saved total {len(all_articles)} articles to {consolidated_filename}")

    except Exception as e:
        logging.error(f"An error occurred: {e}")
    finally:
        # Always close the driver
        driver.quit()

In [91]:
# Entry point: call main() only when __name__ is '__main__', i.e. when this
# code is executed directly rather than imported as a module.
if __name__ == '__main__':
    main()

2025-03-06 09:46:50,595 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-06 09:46:51,376 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-06 09:46:51,854 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-06 09:46:52,627 - INFO: WebDriver version 133.0.6943.141 selected
2025-03-06 09:46:52,637 - INFO: Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/133.0.6943.141/win32/chromedriver-win32.zip
2025-03-06 09:46:52,640 - INFO: About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/133.0.6943.141/win32/chromedriver-win32.zip
2025-03-06 09:46:53,129 - INFO: Driver downloading response is 200
2025-03-06 09:47:03,736 - INFO: Get LATEST chromedriver version for google-chrome
2025-03-06 09:47:04,987 - INFO: Driver has been saved in cache [C:\Users\Bertrand Tan\.wdm\drivers\chromedriver\win64\133.0.6943.141]
2025-03-06 09:47:07,147 - INFO: WebDriver successfully initialized
2025-03-