In [9]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
from random import randint
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Configure headers to mimic a real browser
headers = {
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}

# Configure retry strategy
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session = requests.Session()
session.mount("https://", adapter)

# Lists to store data
article_title = []
article_date = []
article_content = []
article_url = []
article_author = []

pages = np.arange(1, 4)  # Adjust number of pages as needed

for page_num in pages:
    page_url = f"https://businessday.ng/tag/bdlead/page/{page_num}/?amp"
    
    try:
        # Respectful delay with random interval
        time.sleep(randint(3, 10))
        
        # Fetch the page with retries
        response = session.get(page_url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('div', class_='post-info')
        
        if not articles:
            print(f"No articles found on page {page_num}. Stopping.")
            break
            
        for article in articles:
            # Extract Title and URL from listing page
            title = article.find('h2', class_='post-title')
            if title and title.find('a'):
                article_title.append(title.text.strip())
                url = title.find('a')['href']
                article_url.append(url)
            else:
                # Skip articles without title/URL
                continue
                
            try:
                # Visit individual article page
                time.sleep(randint(1, 3))  # Add delay between article requests
                article_res = session.get(url, headers=headers, timeout=10)
                article_res.raise_for_status()
                article_soup = BeautifulSoup(article_res.text, 'lxml')

                # Extract content from article page
                content_tag = article_soup.find('div', class_='post-content')
                article_content.append(
                    content_tag.get_text(strip=True) if content_tag 
                    else 'No content available'
                )

                # Extract author from article page
                author_tag = article_soup.find('p', class_='author-name')
                article_author.append(
                    author_tag.text.strip() if author_tag 
                    else 'No Author'
                )

                # Extract date from article page
                date_tag = article_soup.find('p', class_='post-date')
                article_date.append(
                    date_tag.text.strip() if date_tag 
                    else 'No Date'
                )

            except Exception as e:
                print(f"Error processing article {url}: {e}")
                # Append placeholders if article page fails
                article_content.append('Content unavailable')
                article_author.append('Author unavailable')
                article_date.append('Date unavailable')
                continue
                
        print(f"Successfully processed page {page_num}")
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page_num}: {e}")
        continue
    except Exception as e:
        print(f"General error on page {page_num}: {e}")
        continue

# Create DataFrame
articles_df = pd.DataFrame({
    "Title": article_title,
    "Date": article_date,
    "Content": article_content,
    "URL": article_url,
    "Author": article_author
})

# Save to CSV
articles_df.to_csv("businessday00_articles00.csv", index=False)
print("Scraping completed successfully!")
print(f"Collected {len(articles_df)} articles")

Error fetching page 1: 403 Client Error: Forbidden for url: https://businessday.ng/tag/bdlead/page/1/?amp
Error fetching page 2: 403 Client Error: Forbidden for url: https://businessday.ng/tag/bdlead/page/2/?amp
Error fetching page 3: 403 Client Error: Forbidden for url: https://businessday.ng/tag/bdlead/page/3/?amp
Scraping completed successfully!
Collected 0 articles


In [57]:
articles_df.head()

Unnamed: 0,Title,Date,Content,URL,Author
0,Exclusive: House bill to stop age restrictions...,"April 11, 2025",…as banks risk suspension of operating licence...,https://businessday.ng/news/article/exclusive-...,Godsgift Onyedinefu
1,Naira’s 3.6% fall against USD ‘reasonable’ com...,"April 11, 2025",Naira’s steep fall against the US dollar over ...,https://businessday.ng/news/article/nairas-3-6...,Wasiu Alli
2,"Financial constraints, last-minute changes del...","April 11, 2025",The Federal Government has completed the vetti...,https://businessday.ng/news/article/financial-...,Taofeek Oyedokun
3,Naira blip temporary as analysts bet on rebound,"April 11, 2025",The naira has weakened against the dollar due ...,https://businessday.ng/pro/article/naira-blip-...,Eniola Olatunji
4,Airlines slash fares to Europe on low travels,"April 11, 2025",…As global carriers cut forecastsAs the aviati...,https://businessday.ng/aviation/article/airlin...,Ifeoma Okeke-Korieocha


In [65]:
pip install cloudscraper

Collecting cloudscraperNote: you may need to restart the kernel to use updated packages.

  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Downloading cloudscraper-1.2.71-py2.py3-none-any.whl (99 kB)
Installing collected packages: cloudscraper
Successfully installed cloudscraper-1.2.71


In [1]:
pip install pandas cloudscraper beautifulsoup4




In [5]:
pip install pandas requests requests-html numpy bs4


Collecting requests-html
  Downloading requests_html-0.10.0-py3-none-any.whl.metadata (15 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting pyquery (from requests-html)
  Downloading pyquery-2.0.1-py3-none-any.whl.metadata (9.0 kB)
Collecting parse (from requests-html)
  Downloading parse-1.20.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting pyppeteer>=0.0.14 (from requests-html)
  Downloading pyppeteer-2.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyee<12.0.0,>=11.0.0 (from pyppeteer>=0.0.14->requests-html)
  Downloading pyee-11.1.1-py3-none-any.whl.metadata (2.8 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)
Collecting websockets<11.0,>=10.0 (from pyppeteer>=0.0.14->requests-html)
  Downloading websockets-10.4.tar.gz (84 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading requests_html-0.10.0-py

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.114 requires urllib3>=2.2.2, but you have urllib3 1.26.20 which is incompatible.


In [9]:
pip install lxml-html-clean==0.1.1

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install requests-html

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install cloudscraper




In [None]:
import cloudscraper
from bs4 import BeauttifulSoup
scraper = cloudscraper.create_scraper() 

soup = Beautifulsoup(scraper.get("https://businessday.ng/tag/bdlead/page/{page_num}/?amp").text, 'html parser')

print(soup. text)

In [11]:
import pandas as pd
import numpy as np
import cloudscraper
from bs4 import BeautifulSoup
import time
from random import randint
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Configure headers to mimic a real browser
headers = {
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}

# Configure retry strategy
retry_strategy = Retry(
    total=8,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"]
)
adapter = HTTPAdapter(max_retries=retry_strategy)

# Create cloudscraper instance with retry capabilities
scraper = cloudscraper.create_scraper()
scraper.mount("https://", adapter)
scraper.mount("http://", adapter)

# Lists to store data
article_title = []
article_date = []
article_content = []
article_url = []
article_author = []

pages = np.arange(1, 4)  # Adjust number of pages as needed

for page_num in pages:
    page_url = f"https://businessday.ng/tag/bdlead/page/{page_num}/?amp"
    
    try:
        # Respectful delay with random interval
        time.sleep(randint(3, 10))
        
        # Fetch the page with retries and cloudscraper
        response = scraper.get(page_url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('div', class_='post-info')
        
        if not articles:
            print(f"No articles found on page {page_num}. Stopping.")
            break
            
        for article in articles:
            # Extract Title and URL from listing page
            title = article.find('h2', class_='post-title')
            if title and title.find('a'):
                article_title.append(title.text.strip())
                url = title.find('a')['href']
                article_url.append(url)
            else:
                continue  # Skip articles without title/URL
                
            try:
                # Visit individual article page
                time.sleep(randint(1, 3))
                article_res = scraper.get(url, headers=headers, timeout=10)
                article_res.raise_for_status()
                article_soup = BeautifulSoup(article_res.text, 'lxml')

                # Extract content from article page
                content_tag = article_soup.find('div', class_='post-content')
                article_content.append(
                    content_tag.get_text(strip=True) if content_tag 
                    else 'No content available'
                )

                # Extract author from article page
                author_tag = article_soup.find('p', class_='author-name')
                article_author.append(
                    author_tag.text.strip() if author_tag 
                    else 'No Author'
                )

                # Extract date from article page
                date_tag = article_soup.find('p', class_='post-date')
                article_date.append(
                    date_tag.text.strip() if date_tag 
                    else 'No Date'
                )

            except Exception as e:
                print(f"Error processing article {url}: {e}")
                # Append placeholders if article page fails
                article_content.append('Content unavailable')
                article_author.append('Author unavailable')
                article_date.append('Date unavailable')
                continue
                
        print(f"Successfully processed page {page_num}")
        
    except Exception as e:
        print(f"Error fetching/processing page {page_num}: {e}")
        continue

# Create DataFrame
articles_df = pd.DataFrame({
    "Title": article_title,
    "Date": article_date,
    "Content": article_content,
    "URL": article_url,
    "Author": article_author
})

# Save to CSV
articles_df.to_csv("businessday_articles111.csv", index=False)
print("Scraping completed successfully!")
print(f"Collected {len(articles_df)} articles")

Error fetching/processing page 1: 403 Client Error: Forbidden for url: https://businessday.ng/tag/bdlead/page/1/?amp
Error fetching/processing page 2: 403 Client Error: Forbidden for url: https://businessday.ng/tag/bdlead/page/2/?amp
Error fetching/processing page 3: 403 Client Error: Forbidden for url: https://businessday.ng/tag/bdlead/page/3/?amp
Scraping completed successfully!
Collected 0 articles
