In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
import time
import pandas as pd
import random
import os
import json

# Session directory to maintain cookies between runs
SESSION_DIR = os.path.join(os.getcwd(), "chrome_session")
os.makedirs(SESSION_DIR, exist_ok=True)

# File to store checkpoint data
CHECKPOINT_FILE = "scraper_checkpoint.json"

def remove_existing_files():
    """Remove existing checkpoint and data files to ensure a fresh start"""
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)
    if os.path.exists('businessday_progress3.csv'):
        os.remove('businessday_progress3.csv')
    if os.path.exists('businessday_final3.csv'):
        os.remove('businessday_final3.csv')

def setup_driver():
    """Set up the undetected Chrome driver with necessary options"""
    options = uc.ChromeOptions()
    # Add user data directory to maintain session/cookies
    options.add_argument(f"--user-data-dir={SESSION_DIR}")
    # Make browser less detectable
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    # Use a realistic user agent
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36")
    # Initialize undetected-chromedriver which helps bypass Cloudflare
    driver = uc.Chrome(options=options)
    # Set page load timeout
    driver.set_page_load_timeout(30)
    return driver

def random_sleep(min_seconds=2, max_seconds=5):
    """Sleep for a random amount of time to appear more human-like"""
    time.sleep(random.uniform(min_seconds, max_seconds))

def human_like_scroll(driver):
    """Scroll down the page in a human-like manner"""
    total_height = driver.execute_script("return document.body.scrollHeight")
    current_position = 0
    while current_position < total_height:
        scroll_increment = random.randint(100, 800)
        current_position += scroll_increment
        if current_position > total_height:
            current_position = total_height
        driver.execute_script(f"window.scrollTo(0, {current_position});")
        random_sleep(0.5, 1.5)

def scrape_page(driver, scraped_data):
    """
    Scrape the current page for articles.
    Each article is stored as a dictionary with content initially set to None.
    Returns a list of new articles scraped from this page.
    """
    if "Just a moment" in driver.title:
        print("Still on Cloudflare challenge page. Waiting...")
        time.sleep(15)
        return []

    print(f"Current page title: {driver.title}")
    human_like_scroll(driver)
    new_articles = []
    try:
        # Wait for the news container to be present
        news_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.news"))
        )
        # Find all news items
        news_items = news_container.find_elements(By.XPATH, './/div[@class="post-info"]')
        print(f"Found {len(news_items)} news items")
        if not news_items:
            print("No news items found on the page.")
            return new_articles

        for item in news_items:
            try:
                title_element = item.find_element(By.TAG_NAME, 'h2').find_element(By.TAG_NAME, 'a')
                title = title_element.text
                url = title_element.get_attribute("href")
                author = item.find_element(By.CLASS_NAME, 'post-author').find_element(By.TAG_NAME, 'a').text
                date = item.find_element(By.CLASS_NAME, 'post-date').text
                excerpt = item.find_element(By.TAG_NAME, 'p').text

                # Avoid duplicates by checking if an article with the same title already exists
                if not any(title == article['Title'] for article in scraped_data):
                    article_dict = {
                        'Title': title,
                        'Author': author,
                        'Date': date,
                        'Excerpt': excerpt,
                        'URL': url,
                        'Content': None  # Content will be filled in immediately after scraping the page
                    }
                    scraped_data.append(article_dict)
                    new_articles.append(article_dict)
                    print(f"Added article: {title[:50]}...")
                else:
                    print(f"Skipping duplicate article: {title[:50]}...")
            except Exception as e:
                print(f"Error processing news item: {e}")
                continue
        return new_articles
    except Exception as e:
        print(f"Error scraping page: {e}")
        return new_articles

def extract_article_content(driver, article):
    """
    Opens article URL in a new tab, extracts its content, and updates the article dictionary.
    """
    original_window = driver.current_window_handle
    try:
        # Open a new tab
        driver.execute_script("window.open('');")
        # Switch to new tab
        driver.switch_to.window(driver.window_handles[-1])
        driver.get(article['URL'])
        # Allow the page to load
        time.sleep(2)
        try:
            # Wait until the post content is present
            content_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'post-content'))
            )
            content = content_element.text
            article['Content'] = content.strip()
            print(f"Extracted content for: {article['Title'][:50]}...")
        except Exception as e:
            article['Content'] = 'Content not found'
            print(f"Content extraction failed for: {article['Title'][:50]}: {e}")
    except Exception as e:
        article['Content'] = 'Content extraction error'
        print(f"Error processing article URL {article['URL']}: {e}")
    finally:
        # Close the tab and switch back to the original window
        driver.close()
        driver.switch_to.window(original_window)

def main():
    print("Starting fresh scrape...")
    # Remove existing files to ensure a fresh start
    remove_existing_files()

    # Initialize fresh scraping variables
    page_count = 1
    start_url = 'https://businessday.ng/tag/bdlead/?amp'
    scraped_data = []

    # Setup the driver
    driver = setup_driver()

    try:
        # Navigate to the starting page
        print(f"Navigating to {start_url}")
        driver.get(start_url)
        print("Initial page loaded")
        time.sleep(5)

        # Set the number of pages you want to process
        max_pages = 3
        while page_count <= max_pages:
            print(f"\nProcessing page {page_count}")
            current_url = driver.current_url

            # Scrape the page and get new articles
            new_articles = scrape_page(driver, scraped_data)
            if new_articles:
                print(f"Successfully scraped page {page_count} with {len(new_articles)} new articles")
            else:
                print(f"Failed to scrape page {page_count} or found no new articles")

            # For each new article scraped on this page, immediately extract the full content
            for article in new_articles:
                print(f"Downloading content for article: {article['Title'][:50]}...")
                extract_article_content(driver, article)

            # Save intermediate progress after each page
            pd.DataFrame(scraped_data).to_csv('businessday_progress30.csv', index=False)

            # Break if we have reached our target
            if page_count >= max_pages:
                break

            # Navigate to the next page
            try:
                print("Looking for next page button...")
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@class="next page-numbers"]'))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                random_sleep(1, 2)
                print("Clicking next page button...")
                driver.execute_script("arguments[0].click();", next_button)
                print("Waiting for next page to load...")
                random_sleep(8, 12)
                page_count += 1
            except Exception as e:
                print(f"Error navigating to next page: {e}")
                print("Trying alternative method...")
                try:
                    if "page/" in current_url:
                        parts = current_url.split("page/")
                        current_page_num = int(parts[1].split("/")[0])
                        next_page_num = current_page_num + 1
                        next_url = f"{parts[0]}page/{next_page_num}/"
                    else:
                        next_url = f"{current_url}page/2/"
                    print(f"Navigating directly to: {next_url}")
                    driver.get(next_url)
                    random_sleep(8, 12)
                    page_count += 1
                except Exception as nav_error:
                    print(f"Failed to navigate to next page: {nav_error}")
                    break

        # Save the final results
        if scraped_data:
            df = pd.DataFrame(scraped_data, columns=['Title', 'Author', 'Date', 'Excerpt', 'URL', 'Content'])
            df.to_csv('businessday_final30.csv', index=False)
            print(f"\nScraping completed. Saved {len(scraped_data)} articles")
        else:
            print("No data was scraped")
    except KeyboardInterrupt:
        print("\nScraping interrupted by user.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()

Starting fresh scrape...
Navigating to https://businessday.ng/tag/bdlead/?amp
Initial page loaded

Processing page 1
Current page title: BDlead Archives - Businessday NG
Found 10 news items
Added article: Oil palm growers to replant 1.5m hectares on risin...
Added article: How CBEX wiped off investors’ N1.3trn in nine mont...
Added article: Metering hits 4-year low as FG misses target...
Added article: Investment Act opens window for digital asset mark...
Added article: Alake, Ayeni others to headline BusinessDay solid ...
Added article: Oyo, Kaduna, Kebbi record highest food inflation i...
Added article: Why March inflation defied  analyst projections, i...
Added article: DISCLAIMER: False attribution of article to Busine...
Added article: Access Bank acquires National Bank of Kenya to boo...
Added article: Full list of 51 people killed in Plateau Monday mo...
Successfully scraped page 1 with 10 new articles
Downloading content for article: Oil palm growers to replant 1.5m hectares on

In [9]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
import time
import pandas as pd
import random
import os
import json

# Session directory to maintain cookies between runs
SESSION_DIR = os.path.join(os.getcwd(), "chrome_session")
os.makedirs(SESSION_DIR, exist_ok=True)

def remove_existing_files():
    """Remove existing checkpoint and data files to ensure fresh start"""
    files_to_remove = ['businessday_progress3.csv', 'businessday_final3.csv']
    for file in files_to_remove:
        if os.path.exists(file):
            os.remove(file)

def setup_driver():
    """Set up the undetected Chrome driver with necessary options"""
    options = uc.ChromeOptions()
    options.add_argument(f"--user-data-dir={SESSION_DIR}")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36")
    driver = uc.Chrome(options=options)
    driver.set_page_load_timeout(30)
    return driver

def random_sleep(min_seconds=1, max_seconds=3):
    """Human-like random delay"""
    time.sleep(random.uniform(min_seconds, max_seconds))

def human_like_scroll(driver):
    """Realistic scrolling behavior"""
    scroll_pauses = [random.randint(100, 300) for _ in range(random.randint(3, 6))]
    for pause in scroll_pauses:
        driver.execute_script(f"window.scrollBy(0, {pause});")
        random_sleep(0.2, 0.5)

def scrape_page(driver, scraped_data):
    """Extract articles from current page"""
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.news"))
        )
        human_like_scroll(driver)
        
        articles = driver.find_elements(By.CSS_SELECTOR, 'div.post-info')
        new_articles = []
        
        for article in articles:
            try:
                title_elem = article.find_element(By.CSS_SELECTOR, 'h2 a')
                title = title_elem.text
                if any(a['Title'] == title for a in scraped_data):
                    continue
                
                url = title_elem.get_attribute('href')
                author = article.find_element(By.CSS_SELECTOR, '.post-author a').text
                date = article.find_element(By.CSS_SELECTOR, '.post-date').text
                excerpt = article.find_element(By.CSS_SELECTOR, 'p').text
                
                new_articles.append({
                    'Title': title,
                    'Author': author,
                    'Date': date,
                    'Excerpt': excerpt,
                    'URL': url,
                    'Content': None
                })
            except Exception as e:
                print(f"Error processing article preview: {str(e)}")
                continue
        
        return new_articles
        
    except Exception as e:
        print(f"Page scraping error: {str(e)}")
        return []

def extract_article_content(driver, url):
    """Extract full content from article page"""
    main_window = driver.current_window_handle
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    
    try:
        driver.get(url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.post-content'))
        )
        content = driver.find_element(By.CSS_SELECTOR, '.post-content').text
        random_sleep(1, 2)  # Mimic reading time
    except Exception as e:
        print(f"Content extraction error: {str(e)}")
        content = "Content not available"
    finally:
        driver.close()
        driver.switch_to.window(main_window)
    
    return content.strip()

def main():
    remove_existing_files()
    driver = setup_driver()
    scraped_data = []
    max_pages = 3
    current_page = 1

    try:
        driver.get('https://businessday.ng/tag/bdlead/?amp')
        random_sleep(2, 4)
        
        while current_page <= max_pages:
            print(f"\nProcessing Page {current_page}")
            
            # Scrape articles from current page
            new_articles = scrape_page(driver, scraped_data)
            
            if not new_articles:
                print("No new articles found, stopping...")
                break
            
            # Process each new article immediately
            for article in new_articles:
                print(f"Extracting content for: {article['Title'][:50]}...")
                article['Content'] = extract_article_content(driver, article['URL'])
                scraped_data.append(article)
                
                # Save progress after each article
                pd.DataFrame(scraped_data).to_csv('businessday_progress30.csv', index=False)
                random_sleep(1, 3)  # Between articles
            
            # Save final state for the page
            pd.DataFrame(scraped_data).to_csv('businessday_final30.csv', index=False)
            
            # Navigate to next page
            try:
                next_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.next.page-numbers'))
                )
                driver.execute_script("arguments[0].scrollIntoView();", next_btn)
                random_sleep(1, 2)
                next_btn.click()
                current_page += 1
                random_sleep(3, 5)  # Wait for next page load
            except Exception as e:
                print(f"Pagination error: {str(e)}")
                break

    except Exception as e:
        print(f"Main execution error: {str(e)}")
    finally:
        driver.quit()
        print("\nScraping completed. Final data saved.")

if __name__ == "__main__":
    main()


Processing Page 1
Extracting content for: Oil palm growers to replant 1.5m hectares on risin...
Main execution error: list index out of range

Scraping completed. Final data saved.


In [11]:
### scrapes with all columns including content together
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
import time
import pandas as pd
import random
import os
import json

# Session directory to maintain cookies between runs
SESSION_DIR = os.path.join(os.getcwd(), "chrome_session")
os.makedirs(SESSION_DIR, exist_ok=True)

# File to store checkpoint data
CHECKPOINT_FILE = "scraper_checkpoint.json"

def remove_existing_files():
    """Remove existing checkpoint and data files to ensure a fresh start"""
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)
    if os.path.exists('businessday_progress3.csv'):
        os.remove('businessday_progress3.csv')
    if os.path.exists('businessday_final3.csv'):
        os.remove('businessday_final3.csv')

def setup_driver():
    """Set up the undetected Chrome driver with necessary options"""
    options = uc.ChromeOptions()
    # Add user data directory to maintain session/cookies
    options.add_argument(f"--user-data-dir={SESSION_DIR}")
    # Make browser less detectable
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    # Use a realistic user agent
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36")
    # Initialize undetected-chromedriver which helps bypass Cloudflare
    driver = uc.Chrome(options=options)
    # Set page load timeout
    driver.set_page_load_timeout(30)
    return driver

def random_sleep(min_seconds=2, max_seconds=5):
    """Sleep for a random amount of time to appear more human-like"""
    time.sleep(random.uniform(min_seconds, max_seconds))

def human_like_scroll(driver):
    """Scroll down the page in a human-like manner"""
    total_height = driver.execute_script("return document.body.scrollHeight")
    current_position = 0
    while current_position < total_height:
        scroll_increment = random.randint(100, 800)
        current_position += scroll_increment
        if current_position > total_height:
            current_position = total_height
        driver.execute_script(f"window.scrollTo(0, {current_position});")
        random_sleep(0.5, 1.5)

def scrape_page(driver, scraped_data):
    """
    Scrape the current page for articles.
    Each article is stored as a dictionary with content initially set to None.
    Returns a list of new articles scraped from this page.
    """
    if "Just a moment" in driver.title:
        print("Still on Cloudflare challenge page. Waiting...")
        time.sleep(15)
        return []

    print(f"Current page title: {driver.title}")
    human_like_scroll(driver)
    new_articles = []
    try:
        # Wait for the news container to be present
        news_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.news"))
        )
        # Find all news items
        news_items = news_container.find_elements(By.XPATH, './/div[@class="post-info"]')
        print(f"Found {len(news_items)} news items")
        if not news_items:
            print("No news items found on the page.")
            return new_articles

        for item in news_items:
            try:
                title_element = item.find_element(By.TAG_NAME, 'h2').find_element(By.TAG_NAME, 'a')
                title = title_element.text
                url = title_element.get_attribute("href")
                author = item.find_element(By.CLASS_NAME, 'post-author').find_element(By.TAG_NAME, 'a').text
                date = item.find_element(By.CLASS_NAME, 'post-date').text
                excerpt = item.find_element(By.TAG_NAME, 'p').text

                # Avoid duplicates by checking if an article with the same title already exists
                if not any(title == article['Title'] for article in scraped_data):
                    article_dict = {
                        'Title': title,
                        'Author': author,
                        'Date': date,
                        'Excerpt': excerpt,
                        'URL': url,
                        'Content': None  # Content will be filled in immediately after scraping the page
                    }
                    scraped_data.append(article_dict)
                    new_articles.append(article_dict)
                    print(f"Added article: {title[:50]}...")
                else:
                    print(f"Skipping duplicate article: {title[:50]}...")
            except Exception as e:
                print(f"Error processing news item: {e}")
                continue
        return new_articles
    except Exception as e:
        print(f"Error scraping page: {e}")
        return new_articles

def extract_article_content(driver, article):
    """
    Opens the article URL in a new tab (using Selenium's new_window), extracts its content,
    updates the article dictionary, and then closes the tab.
    """
    original_window = driver.current_window_handle
    try:
        # Open a new tab using Selenium 4's method.
        driver.switch_to.new_window('tab')
        driver.get(article['URL'])
        # Allow the page to load
        time.sleep(2)
        try:
            content_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'post-content'))
            )
            content = content_element.text
            article['Content'] = content.strip()
            print(f"Extracted content for: {article['Title'][:50]}...")
        except Exception as e:
            article['Content'] = 'Content not found'
            print(f"Content extraction failed for: {article['Title'][:50]}: {e}")
    except Exception as e:
        article['Content'] = 'Content extraction error'
        print(f"Error processing article URL {article['URL']}: {e}")
    finally:
        # Close the tab and switch back to the original window
        driver.close()
        driver.switch_to.window(original_window)

def main():
    print("Starting fresh scrape...")
    # Remove existing files to ensure a fresh start
    remove_existing_files()

    # Initialize fresh scraping variables
    page_count = 1
    start_url = 'https://businessday.ng/tag/bdlead/?amp'
    scraped_data = []

    # Setup the driver
    driver = setup_driver()

    try:
        # Navigate to the starting page
        print(f"Navigating to {start_url}")
        driver.get(start_url)
        print("Initial page loaded")
        time.sleep(5)

        # Set the number of pages you want to process
        max_pages = 3
        while page_count <= max_pages:
            print(f"\nProcessing page {page_count}")
            current_url = driver.current_url

            # Scrape the page and get new articles
            new_articles = scrape_page(driver, scraped_data)
            if new_articles:
                print(f"Successfully scraped page {page_count} with {len(new_articles)} new articles")
            else:
                print(f"Failed to scrape page {page_count} or found no new articles")

            # For each new article scraped on this page, immediately extract the full content
            for article in new_articles:
                print(f"Downloading content for article: {article['Title'][:50]}...")
                extract_article_content(driver, article)

            # Save intermediate progress after each page
            pd.DataFrame(scraped_data).to_csv('businessday_progress3.csv', index=False)

            # Break if we have reached our target
            if page_count >= max_pages:
                break

            # Navigate to the next page
            try:
                print("Looking for next page button...")
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@class="next page-numbers"]'))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                random_sleep(1, 2)
                print("Clicking next page button...")
                driver.execute_script("arguments[0].click();", next_button)
                print("Waiting for next page to load...")
                random_sleep(8, 12)
                page_count += 1
            except Exception as e:
                print(f"Error navigating to next page: {e}")
                print("Trying alternative method...")
                try:
                    if "page/" in current_url:
                        parts = current_url.split("page/")
                        current_page_num = int(parts[1].split("/")[0])
                        next_page_num = current_page_num + 1
                        next_url = f"{parts[0]}page/{next_page_num}/"
                    else:
                        next_url = f"{current_url}page/2/"
                    print(f"Navigating directly to: {next_url}")
                    driver.get(next_url)
                    random_sleep(8, 12)
                    page_count += 1
                except Exception as nav_error:
                    print(f"Failed to navigate to next page: {nav_error}")
                    break

        # Save the final results
        if scraped_data:
            df = pd.DataFrame(scraped_data, columns=['Title', 'Author', 'Date', 'Excerpt', 'URL', 'Content'])
            df.to_csv('businessday_final3.csv', index=False)
            print(f"\nScraping completed. Saved {len(scraped_data)} articles")
        else:
            print("No data was scraped")
    except KeyboardInterrupt:
        print("\nScraping interrupted by user.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()

Starting fresh scrape...
Navigating to https://businessday.ng/tag/bdlead/?amp
Initial page loaded

Processing page 1
Current page title: BDlead Archives - Businessday NG
Found 10 news items
Added article: Oil palm growers to replant 1.5m hectares on risin...
Added article: How CBEX wiped off investors’ N1.3trn in nine mont...
Added article: Metering hits 4-year low as FG misses target...
Added article: Investment Act opens window for digital asset mark...
Added article: Alake, Ayeni others to headline BusinessDay solid ...
Added article: Oyo, Kaduna, Kebbi record highest food inflation i...
Added article: Why March inflation defied  analyst projections, i...
Added article: DISCLAIMER: False attribution of article to Busine...
Added article: Access Bank acquires National Bank of Kenya to boo...
Added article: Full list of 51 people killed in Plateau Monday mo...
Successfully scraped page 1 with 10 new articles
Downloading content for article: Oil palm growers to replant 1.5m hectares on

In [22]:
### Scrapes all columns with content and continues from checkpoint
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
import time
import pandas as pd
import random
import os
import json

# Session directory to maintain cookies between runs
SESSION_DIR = os.path.join(os.getcwd(), "chrome_session")
os.makedirs(SESSION_DIR, exist_ok=True)

# File to store checkpoint data
CHECKPOINT_FILE = "scraper_checkpoint.json"

def load_checkpoint():
    """Load existing checkpoint if available."""
    if os.path.exists(CHECKPOINT_FILE):
        try:
            with open(CHECKPOINT_FILE, "r") as f:
                checkpoint = json.load(f)
            print(f"Loaded checkpoint: Page {checkpoint.get('page_count')} from URL: {checkpoint.get('current_url')}")
            return checkpoint
        except Exception as e:
            print(f"Error loading checkpoint: {e}")
    return None

def save_checkpoint(page_count, current_url):
    """Save current progress to a checkpoint file."""
    checkpoint = {
        "page_count": page_count,
        "current_url": current_url,
        "last_update": time.strftime("%Y-%m-%d %H:%M:%S")
    }
    try:
        with open(CHECKPOINT_FILE, "w") as f:
            json.dump(checkpoint, f)
        print(f"Checkpoint saved: Page {page_count} - URL: {current_url}")
    except Exception as e:
        print(f"Error saving checkpoint: {e}")

def load_existing_data():
    """Load previously scraped articles from CSV if available."""
    if os.path.exists('businessday_prog12345.csv'):
        try:
            df = pd.read_csv('businessday_prog12345.csv')
            print(f"Loaded {len(df)} existing articles from CSV")
            return df.to_dict('records')
        except Exception as e:
            print(f"Error loading existing data: {e}")
    return []

def setup_driver():
    """Set up the undetected Chrome driver with necessary options."""
    options = uc.ChromeOptions()
    options.add_argument(f"--user-data-dir={SESSION_DIR}")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36")
    driver = uc.Chrome(options=options)
    driver.set_page_load_timeout(30)
    return driver

def random_sleep(min_seconds=2, max_seconds=5):
    """Sleep for a random amount of time to appear more human-like."""
    time.sleep(random.uniform(min_seconds, max_seconds))

def human_like_scroll(driver):
    """Scroll down the page in a human-like manner."""
    total_height = driver.execute_script("return document.body.scrollHeight")
    current_position = 0
    while current_position < total_height:
        scroll_increment = random.randint(100, 800)
        current_position += scroll_increment
        if current_position > total_height:
            current_position = total_height
        driver.execute_script(f"window.scrollTo(0, {current_position});")
        random_sleep(0.5, 1.5)

def scrape_page(driver, scraped_data):
    """
    Scrape the current page for articles.
    Each article is stored as a dictionary with content initially set to None.
    Returns a list of new articles scraped from this page.
    """
    if "Just a moment" in driver.title:
        print("Still on Cloudflare challenge page. Waiting...")
        time.sleep(15)
        return []

    print(f"Current page title: {driver.title}")
    human_like_scroll(driver)
    new_articles = []
    try:
        news_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.news"))
        )
        news_items = news_container.find_elements(By.XPATH, './/div[@class="post-info"]')
        print(f"Found {len(news_items)} news items")
        if not news_items:
            print("No news items found on the page.")
            return new_articles

        for item in news_items:
            try:
                title_element = item.find_element(By.TAG_NAME, 'h2').find_element(By.TAG_NAME, 'a')
                title = title_element.text
                url = title_element.get_attribute("href")
                author = item.find_element(By.CLASS_NAME, 'post-author').find_element(By.TAG_NAME, 'a').text
                date = item.find_element(By.CLASS_NAME, 'post-date').text
                excerpt = item.find_element(By.TAG_NAME, 'p').text

                if not any(title == article['Title'] for article in scraped_data):
                    article_dict = {
                        'Title': title,
                        'Author': author,
                        'Date': date,
                        'Excerpt': excerpt,
                        'URL': url,
                        'Content': None
                    }
                    scraped_data.append(article_dict)
                    new_articles.append(article_dict)
                    print(f"Added article: {title[:50]}...")
                else:
                    print(f"Skipping duplicate article: {title[:50]}...")
            except Exception as e:
                print(f"Error processing news item: {e}")
                continue
        return new_articles
    except Exception as e:
        print(f"Error scraping page: {e}")
        return new_articles

def extract_article_content(driver, article):
    """
    Open the article URL in a new tab (using Selenium's new window method),
    extract its content, update the article dict, and then close the tab.
    """
    original_window = driver.current_window_handle
    try:
        driver.switch_to.new_window('tab')
        driver.get(article['URL'])
        time.sleep(2)
        try:
            content_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'post-content'))
            )
            content = content_element.text
            article['Content'] = content.strip()
            print(f"Extracted content for: {article['Title'][:50]}...")
        except Exception as e:
            article['Content'] = 'Content not found'
            print(f"Content extraction failed for: {article['Title'][:50]}: {e}")
    except Exception as e:
        article['Content'] = 'Content extraction error'
        print(f"Error processing article URL {article['URL']}: {e}")
    finally:
        driver.close()
        driver.switch_to.window(original_window)

def main():
    print("Starting scraper...")

    # Load checkpoint and existing articles if available.
    checkpoint = load_checkpoint()
    scraped_data = load_existing_data()

    # Determine starting page and URL.
    if checkpoint:
        page_count = checkpoint.get("page_count", 1)
        start_url = checkpoint.get("current_url", "https://businessday.ng/tag/bdlead/?amp")
    else:
        page_count = 1
        start_url = 'https://businessday.ng/tag/bdlead/?amp'
        print("No checkpoint found. Starting from page 1.")

    # Setup the driver.
    driver = setup_driver()

    try:
        print(f"Navigating to {start_url}")
        driver.get(start_url)
        print("Initial page loaded")
        time.sleep(5)

        # Set the maximum pages to process (adjust as needed).
        max_pages = 820
        while page_count <= max_pages:
            print(f"\nProcessing page {page_count}")
            current_url = driver.current_url

            new_articles = scrape_page(driver, scraped_data)
            if new_articles:
                print(f"Successfully scraped page {page_count} with {len(new_articles)} new articles")
            else:
                print(f"Failed to scrape page {page_count} or found no new articles")

            # Extract content for new articles immediately.
            for article in new_articles:
                print(f"Downloading content for article: {article['Title'][:50]}...")
                extract_article_content(driver, article)

            # Save intermediate progress.
            pd.DataFrame(scraped_data).to_csv('businessday_progress301.csv', index=False)
            save_checkpoint(page_count, current_url)

            if page_count >= max_pages:
                break

            # Navigate to the next page.
            try:
                print("Looking for next page button...")
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@class="next page-numbers"]'))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                random_sleep(1, 2)
                print("Clicking next page button...")
                driver.execute_script("arguments[0].click();", next_button)
                print("Waiting for next page to load...")
                random_sleep(8, 12)
                page_count += 1
            except Exception as e:
                print(f"Error navigating to next page: {e}")
                print("Trying alternative method...")
                try:
                    if "page/" in current_url:
                        parts = current_url.split("page/")
                        current_page_num = int(parts[1].split("/")[0])
                        next_page_num = current_page_num + 1
                        next_url = f"{parts[0]}page/{next_page_num}/"
                    else:
                        next_url = f"{current_url}page/2/"
                    print(f"Navigating directly to: {next_url}")
                    driver.get(next_url)
                    random_sleep(8, 12)
                    page_count += 1
                except Exception as nav_error:
                    print(f"Failed to navigate to next page: {nav_error}")
                    break

        # Save final results.
        if scraped_data:
            df = pd.DataFrame(scraped_data, columns=['Title', 'Author', 'Date', 'Excerpt', 'URL', 'Content'])
            df.to_csv('businessday_final301.csv', index=False)
            print(f"\nScraping completed. Saved {len(scraped_data)} articles")
        else:
            print("No data was scraped")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()

Starting scraper...
Loaded checkpoint: Page 820 from URL: https://businessday.ng/tag/bdlead/page/808/
Loaded 3275 existing articles from CSV
Navigating to https://businessday.ng/tag/bdlead/page/808/
Initial page loaded

Processing page 820
Still on Cloudflare challenge page. Waiting...
Failed to scrape page 820 or found no new articles
Checkpoint saved: Page 820 - URL: https://businessday.ng/tag/bdlead/page/808/

Scraping completed. Saved 3275 articles


In [32]:
### Scrapes all columns with content and continues from checkpoint
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
import time
import pandas as pd
import random
import os
import json

# Session directory to maintain cookies between runs
SESSION_DIR = os.path.join(os.getcwd(), "chrome_session")
os.makedirs(SESSION_DIR, exist_ok=True)

# File to store checkpoint data
CHECKPOINT_FILE = "scraper_checkpoint.json"

def load_checkpoint():
    """Load existing checkpoint if available."""
    if os.path.exists(CHECKPOINT_FILE):
        try:
            with open(CHECKPOINT_FILE, "r") as f:
                checkpoint = json.load(f)
            print(f"Loaded checkpoint: Page {checkpoint.get('page_count')} from URL: {checkpoint.get('current_url')}")
            return checkpoint
        except Exception as e:
            print(f"Error loading checkpoint: {e}")
    return None

def save_checkpoint(page_count, current_url):
    """Save current progress to a checkpoint file."""
    checkpoint = {
        "page_count": page_count,
        "current_url": current_url,
        "last_update": time.strftime("%Y-%m-%d %H:%M:%S")
    }
    try:
        with open(CHECKPOINT_FILE, "w") as f:
            json.dump(checkpoint, f)
        print(f"Checkpoint saved: Page {page_count} - URL: {current_url}")
    except Exception as e:
        print(f"Error saving checkpoint: {e}")

def load_existing_data():
    """Load previously scraped articles from CSV if available."""
    if os.path.exists('businessday_prog12345.csv'):
        try:
            df = pd.read_csv('businessday_prog12345.csv')
            print(f"Loaded {len(df)} existing articles from CSV")
            return df.to_dict('records')
        except Exception as e:
            print(f"Error loading existing data: {e}")
    return []

def setup_driver():
    """Set up the undetected Chrome driver with necessary options."""
    options = uc.ChromeOptions()
    options.add_argument(f"--user-data-dir={SESSION_DIR}")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36")
    driver = uc.Chrome(options=options)
    driver.set_page_load_timeout(30)
    return driver

def random_sleep(min_seconds=2, max_seconds=5):
    """Sleep for a random amount of time to appear more human-like."""
    time.sleep(random.uniform(min_seconds, max_seconds))

def human_like_scroll(driver):
    """Scroll down the page in a human-like manner."""
    total_height = driver.execute_script("return document.body.scrollHeight")
    current_position = 0
    while current_position < total_height:
        scroll_increment = random.randint(100, 800)
        current_position += scroll_increment
        if current_position > total_height:
            current_position = total_height
        driver.execute_script(f"window.scrollTo(0, {current_position});")
        random_sleep(0.5, 1.5)

def scrape_page(driver, scraped_data):
    """
    Scrape the current page for articles.
    Each article is stored as a dictionary with content initially set to None.
    Returns a list of new articles scraped from this page.
    """
    if "Just a moment" in driver.title:
        print("Still on Cloudflare challenge page. Waiting...")
        time.sleep(15)
        return []

    print(f"Current page title: {driver.title}")
    human_like_scroll(driver)
    new_articles = []
    try:
        news_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.news"))
        )
        news_items = news_container.find_elements(By.XPATH, './/div[@class="post-info"]')
        print(f"Found {len(news_items)} news items")
        if not news_items:
            print("No news items found on the page.")
            return new_articles

        for item in news_items:
            try:
                title_element = item.find_element(By.TAG_NAME, 'h2').find_element(By.TAG_NAME, 'a')
                title = title_element.text
                url = title_element.get_attribute("href")
                author = item.find_element(By.CLASS_NAME, 'post-author').find_element(By.TAG_NAME, 'a').text
                date = item.find_element(By.CLASS_NAME, 'post-date').text
                excerpt = item.find_element(By.TAG_NAME, 'p').text

                if not any(title == article['Title'] for article in scraped_data):
                    article_dict = {
                        'Title': title,
                        'Author': author,
                        'Date': date,
                        'Excerpt': excerpt,
                        'URL': url,
                        'Content': None
                    }
                    scraped_data.append(article_dict)
                    new_articles.append(article_dict)
                    print(f"Added article: {title[:50]}...")
                else:
                    print(f"Skipping duplicate article: {title[:50]}...")
            except Exception as e:
                print(f"Error processing news item: {e}")
                continue
        return new_articles
    except Exception as e:
        print(f"Error scraping page: {e}")
        return new_articles

def extract_article_content(driver, article):
    """
    Open the article URL in a new tab (using Selenium's new window method),
    extract its content, update the article dict, and then close the tab.
    """
    original_window = driver.current_window_handle
    try:
        driver.switch_to.new_window('tab')
        driver.get(article['URL'])
        time.sleep(2)
        try:
            content_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'post-content'))
            )
            content = content_element.text
            article['Content'] = content.strip()
            print(f"Extracted content for: {article['Title'][:50]}...")
        except Exception as e:
            article['Content'] = 'Content not found'
            print(f"Content extraction failed for: {article['Title'][:50]}: {e}")
    except Exception as e:
        article['Content'] = 'Content extraction error'
        print(f"Error processing article URL {article['URL']}: {e}")
    finally:
        driver.close()
        driver.switch_to.window(original_window)

def main():
    print("Starting scraper...")

    # Load checkpoint and existing articles if available.
    checkpoint = load_checkpoint()
    scraped_data = load_existing_data()

    # Determine starting page and URL.
    if checkpoint:
        page_count = checkpoint.get("page_count", 1)
        current_url = checkpoint.get("current_url", "https://businessday.ng/tag/bdlead/?amp")
        print(f"Resuming from page {page_count} at {current_url}")
    else:
        page_count = 1
        current_url = 'https://businessday.ng/tag/bdlead/?amp'
        print("No checkpoint found. Starting from page 1.")
    driver = setup_driver()

    try:
        driver.get(current_url)
        print("Page loaded. Waiting a moment for any challenges…")
        time.sleep(5)

        while True:
            print(f"\nProcessing page {page_count}")
            new_articles = scrape_page(driver, scraped_data)
            if new_articles:
                print(f"  → {len(new_articles)} new articles found")
            else:
                print("  → No new articles found on this page")

            # Immediately pull full content for each new article
            for art in new_articles:
                print(f"    * Downloading content for: {art['Title'][:40]}…")
                extract_article_content(driver, art)

            # Save progress and checkpoint
            pd.DataFrame(scraped_data).to_csv('businessday_progress3.csv', index=False)
            save_checkpoint(page_count, driver.current_url)

            # Try clicking “Next”…
            try:
                next_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@class="next page-numbers"]'))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_btn)
                random_sleep(1, 2)
                next_btn.click()
                random_sleep(8, 12)
                page_count += 1
                continue
            except Exception:
                # Fallback: try constructing the next-page URL
                print("Next button not found—trying direct URL method…")
                try:
                    url = driver.current_url
                    if "page/" in url:
                        base, num = url.rsplit("page/", 1)
                        next_page = int(num.split("/")[0]) + 1
                        next_url = f"{base}page/{next_page}/"
                    else:
                        next_url = url.rstrip("/") + "/page/2/"
                    print(f"Navigating directly to {next_url}")
                    driver.get(next_url)
                    random_sleep(8,12)
                    page_count += 1
                    continue
                except Exception:
                    print("Could not navigate to the next page. Assuming end of pages.")
                    break

        # final save
        if scraped_data:
            df = pd.DataFrame(scraped_data,
                              columns=['Title','Author','Date','Excerpt','URL','Content'])
            df.to_csv('businessday_final3.csv', index=False)
            print(f"\nScraping completed. Saved {len(scraped_data)} articles.")
        else:
            print("No data was scraped.")
    except KeyboardInterrupt:
        print("\nInterrupted by user—saving progress!")
        pd.DataFrame(scraped_data).to_csv('businessday_progress3.csv', index=False)
        save_checkpoint(page_count, driver.current_url)
    finally:
        driver.quit()
        
if __name__ == "__main__":
    main()

Starting scraper...
Loaded checkpoint: Page 936 from URL: https://businessday.ng/tag/bdlead/page/924/
Loaded 3275 existing articles from CSV
Resuming from page 936 at https://businessday.ng/tag/bdlead/page/924/
Page loaded. Waiting a moment for any challenges…

Processing page 936
Current page title: BDlead Archives - Page 924 of 1643 - Businessday NG
Found 0 news items
No news items found on the page.
  → No new articles found on this page


PermissionError: [Errno 13] Permission denied: 'businessday_progress3.csv'

In [68]:
!python webs_scraper.ipynb

python: can't open file 'C:\\Users\\USER\\webs_scraper.ipynb': [Errno 2] No such file or directory


In [66]:
!rm scraper_checkpoint.json businessday_progress3.csv
!python webs_scraper.ipynb

'rm' is not recognized as an internal or external command,
operable program or batch file.
python: can't open file 'C:\\Users\\USER\\webs_scraper.ipynb': [Errno 2] No such file or directory


In [70]:
jupyter notebook

SyntaxError: invalid syntax (3305369356.py, line 1)