# In [None]:
#Importing necessary libraries
import requests
import concurrent.futures
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import pandas as pd

# Create one shared session object, used for every HTTP request in this script
# so that underlying connections can be reused between requests.
session = requests.Session()

def extract_article_url(article_tag):
    """Build the e-paper URL of one article from its archive-page tag.

    Args:
        article_tag: Object supporting key lookup (for example a
            BeautifulSoup tag) that provides the ``date``, ``page``, ``id``
            and ``type`` entries.

    Returns:
        The article URL as a string. When ``type`` is ``'attachment'`` the
        path gains an ``attachment/`` segment and the file name ends in
        ``-picture``; for any other type the file name ends in ``-<type>``.

    Raises:
        KeyError: If one of the four entries is missing (when ``article_tag``
            is a dict or a BeautifulSoup tag).
    """
    day = article_tag['date']
    page_no = article_tag['page']
    article_id = article_tag['id']
    kind = article_tag['type']

    if kind == 'attachment':
        # Attachments live one directory deeper and are always named "picture".
        path_prefix, name_suffix = kind + '/', 'picture'
    else:
        path_prefix, name_suffix = '', kind

    return f"https://epaper.brecorder.com/{day}/{page_no}-page/{path_prefix}{article_id}-{name_suffix}.html"

def crawl_article(url):
    """Download the HTML of a single article page.

    Uses the module-level ``session`` for the request.

    Args:
        url: Absolute URL of the article to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200;
        otherwise ``None`` (a failure message is printed first). Network
        errors and timeouts are reported the same way instead of being raised.
    """
    try:
        # Without a timeout a stalled connection would block the calling
        # thread indefinitely.
        response = session.get(url, timeout=30)
    except requests.RequestException as exc:
        # Treat transport failures like a non-200 answer so that one bad
        # article cannot abort the whole crawl.
        print("Failed to fetch the article:", url, f"({exc})")
        return None

    if response.status_code != 200:
        print("Failed to fetch the article:", url)
        return None
    return response.text

def extract_article_info(url, article_html):
    """Parse an article page and collect its url, date, title and content.

    Args:
        url: URL the page was fetched from; copied into the result unchanged.
        article_html: HTML source of the article page.

    Returns:
        A dict with the keys ``'url'``, ``'date'``, ``'title'`` and
        ``'content'``. ``title`` comes from the first ``<h2>``, ``date`` from
        the first ``<span class="story__time">`` and ``content`` from the
        first ``<p>``; each is whitespace-stripped text, or ``None`` when the
        element is not found.
    """
    soup = BeautifulSoup(article_html, 'html.parser')

    def stripped_text(name, **attrs):
        # Text of the first matching element, or None when there is no match.
        node = soup.find(name, **attrs)
        return node.text.strip() if node else None

    # NOTE: only the first <p> is captured as the content.
    return {
        'url': url,
        'date': stripped_text('span', class_="story__time"),
        'title': stripped_text('h2'),
        'content': stripped_text('p'),
    }

def scrape_articles(article_tags):
    """Fetch and parse several articles concurrently.

    Args:
        article_tags: Sequence of tags accepted by ``extract_article_url``.

    Returns:
        A list of article-info dicts (as produced by
        ``extract_article_info``), in the same order as ``article_tags``.
        Articles whose download failed are omitted.
    """
    # Build every URL exactly once; it is needed both for the request and
    # for the resulting record.
    urls = [extract_article_url(article_tag) for article_tag in article_tags]
    if not urls:
        # Nothing to download, so do not create a thread pool.
        return []

    articles = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit all HTTP requests up front so they run concurrently.
        futures = [executor.submit(crawl_article, url) for url in urls]
        for url, future in zip(urls, futures):
            try:
                article_html = future.result()
            except requests.RequestException as exc:
                # result() re-raises whatever the worker raised; skip this
                # article rather than losing everything collected so far.
                print("Failed to fetch the article:", url, f"({exc})")
                continue
            if article_html:
                articles.append(extract_article_info(url, article_html))
    return articles

def crawl_website(base_url, start_date, end_date, max_pages):
    """Scrape every archive page for each day in a date range.

    For each day from ``start_date`` to ``end_date`` (both inclusive) and for
    each page number from 1 to ``max_pages``, the archive page
    ``{base_url}{YYYY/MM/DD}/{page}-page.html`` is fetched with the
    module-level ``session``, its ``<rect class="news">`` tags are collected
    and the articles they point to are scraped.

    Args:
        base_url: Archive root URL; the date path is appended directly, so it
            is expected to end with ``/``.
        start_date: First day to crawl (``datetime``).
        end_date: Last day to crawl (``datetime``).
        max_pages: Number of archive pages to request per day (integer).

    Returns:
        A list with the article-info dicts of all scraped articles. Pages
        that cannot be fetched are reported with a printed message and
        skipped.
    """
    news = []
    current_date = start_date
    while current_date <= end_date:
        # The date segment is identical for every page of the day.
        formatted_date = current_date.strftime("%Y/%m/%d")
        for page in range(1, max_pages + 1):
            url = f"{base_url}{formatted_date}/{page}-page.html"
            try:
                # Timeout so a stalled connection cannot hang the crawl.
                response = session.get(url, timeout=30)
            except requests.RequestException as exc:
                # A transport error on one page must not discard the
                # articles gathered so far.
                print(f"Failed to fetch page {page} for date {formatted_date} ({exc})")
                continue
            if response.status_code != 200:
                print(f"Failed to fetch page {page} for date {formatted_date}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            article_tags = soup.find_all('rect', class_='news')
            news.extend(scrape_articles(article_tags))
        current_date += timedelta(days=1)
    return news


# Crawl configuration: archive root, date range and archive pages fetched per day.
base_url = "https://epaper.brecorder.com/"
start_date = datetime(2018, 1, 1)  # Start date (inclusive)
end_date = datetime(2024, 3, 16)    # End date (inclusive)
max_pages = 8
try:
    news = crawl_website(base_url, start_date, end_date, max_pages)
finally:
    # Close the session even when the crawl raises, so it is never left open.
    session.close()

# Convert the scraped records into a DataFrame and save them as a CSV file.
df = pd.DataFrame(news)
df.to_csv('News.csv', index=False)
