In [None]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os
import time
class CoinDeskScraper:
    """Scrape bitcoin-tagged CoinDesk articles and store them as text files.

    Listing pages are fetched from ``tag_base_url``; each article found on a
    page is downloaded, split into heading-delimited sections and written to
    ``<data folder>/<year>/<month>/<day>/<title>.txt``.  The number of the last
    fully processed listing page is kept in ``last_processed_page.txt`` inside
    the data folder so that a later run can resume.
    """

    # Seconds to wait for a single HTTP response; without a timeout
    # ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Path separators and characters commonly rejected in file names are
    # mapped to "_" when an article title is turned into a file name.
    _UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    def __init__(self):
        self.base_url = "https://www.coindesk.com"
        self.tag_base_url = "https://www.coindesk.com/tag/bitcoin/"
        # Set by grab_last_page_number(); loop_through_pages() refuses to run
        # until it is known.
        self.last_page = None
        self.user_agent = UserAgent()

    def get_fake_user_agent(self):
        """Return a random user agent string, or a fixed fallback on failure."""
        try:
            return self.user_agent.random
        except Exception:
            # Best effort: any failure inside fake_useragent falls back to a
            # static browser string instead of aborting the scrape.
            return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"

    def get_request_headers(self):
        """Return request headers carrying a fake user agent."""
        return {
            "User-Agent": self.get_fake_user_agent()
        }

    def get_article_type(self, article):
        """Return the text of the article card's category link, or None."""
        article_type_tag = article.find("a", class_="category")
        if article_type_tag:
            return article_type_tag.get_text(strip=True)
        return None

    def find_article_subheadline(self, soup):
        """Return the subheadline text found in ``soup``, or None if absent."""
        subheadline_div = soup.find("div", class_="at-subheadline")
        if subheadline_div:
            subheadline = subheadline_div.find("h2", class_="typography__StyledTypography-sc-owin6q-0 irVmAp")
            if subheadline:
                return subheadline.get_text(strip=True)
        return None

    def extract_article_structure(self, article_content_soup):
        """Split the article body into sections delimited by heading tags.

        Args:
            article_content_soup: element whose ``find_all(True)`` yields the
                tags of the article body in document order.

        Returns:
            A list of dicts with keys ``"tag"`` (heading tag name, or None for
            content preceding the first heading), ``"heading"`` (heading text
            or None) and ``"text"`` (the section's tag texts joined by single
            spaces).
        """
        heading_tags = ('h2', 'h3', 'h4', 'h5', 'h6', 'h7')
        sections = []

        def flush(section):
            # Keep a section that has a heading even when no text followed
            # it; otherwise the first of two consecutive headings is lost.
            if section["text"] or section["heading"]:
                section["text"] = ' '.join(section["text"])
                sections.append(section)

        current_section = {"tag": None, "text": [], "heading": None}
        # NOTE: find_all(True) also yields nested tags, so the text of a
        # nested element is collected once per enclosing tag.
        for tag in article_content_soup.find_all(True):
            if tag.name in heading_tags:
                flush(current_section)
                current_section = {
                    "tag": tag.name,
                    "text": [],
                    "heading": tag.get_text(strip=True),
                }
            else:
                current_section["text"].append(tag.get_text(strip=True))

        flush(current_section)
        return sections

    def get_article_body(self, link):
        """Download an article and return its sections as one text blob.

        Args:
            link: site-relative article path, appended to ``base_url``.

        Returns:
            The formatted article text.  An empty string is returned when the
            request fails, the response is not HTTP 200, or the article body
            element is missing.
        """
        content_parts = []
        try:
            response = requests.get(
                self.base_url + link,
                headers=self.get_request_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                article_body_div = soup.find("div", {"data-module-name": "article-body"})
                if article_body_div:
                    for section in self.extract_article_structure(article_body_div):
                        label = section['tag'].upper() if section['tag'] else 'CONTENT'
                        heading = section['heading'] if section['heading'] else ''
                        content_parts.append(f"\n{label}: {heading}\n")
                        content_parts.append(section["text"] + "\n")
            else:
                print(f"Failed to fetch article body: HTTP {response.status_code}")
        except Exception as e:
            # Best effort: one broken article must not stop the page loop.
            print(f"Error fetching article body: {e}")
        return "".join(content_parts)

    def get_article_title_and_link(self, article):
        """Return ``(title, href)`` of the card's title link.

        Returns ``(None, None)`` when the card has no title link; ``href`` is
        None when the link carries no ``href`` attribute.
        """
        title_tag = article.find("a", class_="card-title")
        if title_tag:
            return title_tag.get_text(strip=True), title_tag.get("href")
        return None, None

    def extract_article_date(self, article):
        """Return the card's publication date as a dict, or None.

        The date text is split on whitespace and read as
        ``<month> <day>, <year>``; abbreviated month names are expanded.

        Returns:
            ``{"month": ..., "day": ..., "year": ...}`` (all strings), or None
            when the date element is missing or has fewer than three parts.
        """
        date_div = article.find("div", class_="ac-publishing-date")
        if not date_div:
            return None
        date_span = date_div.find("span", class_="typography__StyledTypography-sc-owin6q-0 hcIsFR")
        if not date_span:
            return None

        date_text = date_span.get_text(strip=True).split()
        if len(date_text) < 3:
            # Unexpected format: report "no date" instead of raising.
            return None
        month, day, year = date_text[0], date_text[1].strip(','), date_text[2]
        # Convert abbreviated month to full month name
        month_fullnames = {
            'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
            'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
            'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
        }
        month = month_fullnames.get(month, month)
        print (f"month: {month}, day: {day}, year: {year}")
        return {"month": month, "day": day, "year": year}

    def create_folder_structure(self, year, month, day, data_folder_path):
        """Create ``data_folder_path/year/month/day`` and return its path."""
        day_path = os.path.join(data_folder_path, year, month, day)
        # exist_ok creates any missing parents and tolerates existing ones.
        os.makedirs(day_path, exist_ok=True)
        return day_path

    def save_article_to_file(self, article_title, article_content, article_date, data_folder_path):
        """Write ``article_content`` to a UTF-8 text file named after the title.

        The file is placed in the year/month/day folder derived from
        ``article_date``.  Spaces in the title become underscores, as do path
        separators and other unsafe file-name characters.  If the name is
        already taken, ``_1``, ``_2``, ... is appended until a free name is
        found.
        """
        day_folder_path = self.create_folder_structure(
            article_date['year'], article_date['month'], article_date['day'], data_folder_path
        )
        safe_title = article_title.replace(' ', '_').translate(self._UNSAFE_FILENAME_CHARS)
        file_path = os.path.join(day_folder_path, safe_title + '.txt')
        index = 0
        while os.path.exists(file_path):
            index += 1
            file_path = os.path.join(day_folder_path, f'{safe_title}_{index}.txt')

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(article_content)

    def _process_article(self, article, data_folder_path):
        """Print one article card's metadata, then download and save its body."""
        article_date = self.extract_article_date(article)
        if article_date:
            print(f"Date: {article_date['month']} {article_date['day']}, {article_date['year']}")

        article_type = self.get_article_type(article)
        if article_type:
            print(f"Article Type: {article_type}")

        article_title, article_link = self.get_article_title_and_link(article)
        if article_title and article_link:
            print(f"Article Title: {article_title}\nArticle Link: {self.base_url + article_link}")

        # The title names the file, the link locates the body and the date
        # picks the folder; without all three the article cannot be stored.
        if not (article_title and article_link and article_date):
            print("Skipping article: missing title, link or publication date.")
            return

        article_content = self.get_article_body(article_link)
        if article_content:
            self.save_article_to_file(article_title, article_content, article_date, data_folder_path)

    def find_all_articles(self, page_number, data_folder_path, max_retries=5, sleep_duration=10):
        """Scrape every article card on one listing page.

        Args:
            page_number: listing page to fetch (appended to ``tag_base_url``).
            data_folder_path: root folder for saved articles and the
                last-processed-page marker.
            max_retries: how many connection errors or timeouts are tolerated
                before giving up on the page.
            sleep_duration: seconds to wait between retries.

        The page number is recorded as processed only after all of its
        articles were handled.  Any other error is printed and ends the
        attempt for this page.
        """
        headers = self.get_request_headers()
        page_url = self.tag_base_url + str(page_number) + "/"
        retries = 0  # Keep track of retry attempts

        while retries < max_retries:
            try:
                response = requests.get(page_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()  # Raise an HTTPError for bad responses

                soup = BeautifulSoup(response.content, 'html.parser')
                articles = soup.find_all("div", class_="article-cardstyles__StyledWrapper-sc-q1x8lc-0 eJFoEa article-card default")

                print(f"Total articles found: {len(articles)}")

                for idx, article in enumerate(articles):
                    print(f"\nArticle {idx + 1}:")
                    self._process_article(article, data_folder_path)

                self.save_last_processed_page(page_number, data_folder_path)
                break  # Break out of the while loop if request is successful

            except (requests.ConnectionError, requests.Timeout):
                retries += 1
                print(f"Connection error occurred. Retrying in {sleep_duration} seconds. Attempt {retries}/{max_retries}.")
                time.sleep(sleep_duration)
            except Exception as e:
                print(f"An unexpected error occurred: {e}")
                break

    def save_last_processed_page(self, page_number, data_folder_path):
        """Record ``page_number`` in ``last_processed_page.txt``."""
        file_path = os.path.join(data_folder_path, 'last_processed_page.txt')
        with open(file_path, 'w') as file:
            file.write(str(page_number))

    def get_last_processed_page(self, data_folder_path):
        """Return the recorded page number, or None.

        None is returned when the marker file does not exist or does not
        contain an integer.
        """
        file_path = os.path.join(data_folder_path, 'last_processed_page.txt')
        if not os.path.exists(file_path):
            return None
        with open(file_path, 'r') as file:
            raw_value = file.read().strip()
        try:
            return int(raw_value)
        except ValueError:
            # Empty or corrupt marker: behave as if nothing was recorded.
            return None

    def create_data_folder(self):
        """Create the ``Data`` folder in the current working directory.

        Returns:
            The absolute path of the folder.
        """
        data_folder_path = os.path.join(os.getcwd(), 'Data')
        os.makedirs(data_folder_path, exist_ok=True)
        return data_folder_path

    def grab_last_page_number(self):
        """Read the highest page number from the pagination of listing page 1.

        Returns:
            The page number (also stored in ``self.last_page``), or None when
            the page could not be retrieved or parsed.
        """
        try:
            response = requests.get(
                self.tag_base_url + "1/",
                headers=self.get_request_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                page_links = soup.find_all("a", class_="page-link", href=True) # ensure it has href
                last_link = page_links[-2]  # -2 because the last item is the "next" button
                self.last_page = int(last_link.text.strip())
                print(f"Last page number: {self.last_page}")
                return self.last_page
            print("Failed to retrieve the webpage.")
        except Exception as e:
            print(f"An error occurred: {e}")
        return None

    def loop_through_pages(self, data_folder_path, end_page=250, start_page=169):
        """Scrape listing pages in ascending order up to ``end_page``.

        Args:
            data_folder_path: root folder for saved articles and the
                last-processed-page marker.
            end_page: last page to scrape; capped at ``self.last_page``.
            start_page: first page to scrape when no progress has been
                recorded.  The default keeps the value that used to be
                hard-coded here.

        Requires ``self.last_page`` to be set (see grab_last_page_number()).
        """
        if not self.last_page:
            print("Please grab the last page first.")
            return

        # Ensure that end_page is not greater than the last page available
        end_page = min(end_page, self.last_page)

        # NOTE: resuming starts at the recorded page itself, so that page is
        # fetched again.
        last_processed = self.get_last_processed_page(data_folder_path)
        first_page = last_processed if last_processed else start_page
        for page in range(first_page, end_page + 1):
            self.find_all_articles(page, data_folder_path)

if __name__ == "__main__":
    # Build the scraper and make sure the output folder exists before any
    # page is fetched.
    scraper = CoinDeskScraper()
    data_folder_path = scraper.create_data_folder()

    # The pagination of the first listing page tells how far the loop may go;
    # the value is also cached on the scraper instance.
    last_page = scraper.grab_last_page_number()

    # Single-page run, handy while debugging:
    # scraper.find_all_articles(1, data_folder_path)
    scraper.loop_through_pages(data_folder_path, end_page=last_page)

Last page number: 611
Total articles found: 10

Article 1:
month: December, day: 21, year: 2022
Date: December 21, 2022
Article Type: Finance
Article Title: Bitcoin Miner Core Scientific Files for Bankruptcy, Expects Support From Some Debt Holders
Article Link: https://www.coindesk.com/business/2022/12/21/core-scientific-one-of-the-largest-bitcoin-miners-files-for-bankruptcy-protection/

Article 2:
month: December, day: 21, year: 2022
Date: December 21, 2022
Article Type: Finance
Article Title: Core Scientific to File for Bankruptcy, Continue Mining Through Process: Report
Article Link: https://www.coindesk.com/business/2022/12/21/core-scientific-to-file-for-bankruptcy-continue-mining-through-process-report/

Article 3:
month: December, day: 21, year: 2022
Date: December 21, 2022
Article Type: Markets
Article Title: First Mover Asia: BitDAO’s $100M Token Buyback Plan Gets Mixed Reviews
Article Link: https://www.coindesk.com/markets/2022/12/21/first-mover-asia-bitdaos-100m-token-buyback