In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import os
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Create a data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Set up retries for failed requests
session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

def scrape_lovecraft_content(content_type):
    """
    Scrapes H.P. Lovecraft's writings based on the content type specified (fiction, poetry, essays, letters).
    
    Parameters:
        content_type (str): The type of content to scrape ('fiction', 'poetry', 'essays', 'letters').
    """
    # Base URL
    base_url = "https://www.hplovecraft.com/writings/texts/"
    response = session.get(base_url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        content_links = []

        # Find all relevant links
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith(f'{content_type}/') and not href.startswith('#'):
                full_url = f"{base_url}{href}"
                content_links.append(full_url)

        # Open CSV file to write
        csv_filename = f'data/lovecraft_{content_type}.csv'
        with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(['Content Type', 'Title', 'Text'])  # Add content type column

            for content_url in content_links:
                time.sleep(random.uniform(1, 3))  # Randomize throttle time between 1-3 seconds
                try:
                    headers = {'User-Agent': 'Mozilla/5.0'}
                    content_response = session.get(content_url, headers=headers)

                    if content_response.status_code == 200:
                        content_soup = BeautifulSoup(content_response.content, 'html.parser')
                        title_tag = content_soup.find('font', size="+2")  # Title font size
                        author_tag = content_soup.find('font', size="+1")  # Author font size
                        text_div = content_soup.find('div', align='justify')  # Update as necessary

                        # Validate title and text
                        if title_tag and text_div:
                            title = f"{title_tag.get_text(strip=True)} by {author_tag.get_text(strip=True)}"
                            csvwriter.writerow([content_type, title, text_div.get_text(strip=True)])
                            print(f'Scraped: {title}')
                        else:
                            print(f'Title or text not found for {content_url}')
                    else:
                        print(f'Failed to scrape {content_url}: {content_response.status_code}')
                except Exception as e:
                    print(f'Error scraping {content_url}: {e}')
    else:
        print(f'Failed to access the base URL: {response.status_code}')

# Scrape different types of content
for content in ['fiction', 'poetry', 'essays', 'letters']:
    scrape_lovecraft_content(content)


Scraped: The AlchemistBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: AshesBy C. M. Eddy, Jr.with H. P. Lovecraft by By C. M. Eddy, Jr.with H. P. Lovecraft
Scraped: At the Mountains of MadnessBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: AzathothBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: The Battle that Ended the Century(MS. Found in a Time Machine)By R. H. Barlowwith H. P. Lovecraft by (MS. Found in a Time Machine)By R. H. Barlowwith H. P. Lovecraft
Scraped: The Beast in the CaveBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: Beyond the Wall of SleepBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: The BookBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: The Call of CthulhuBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: The Case of Charles Dexter WardBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: The Cats of UltharBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: CelephaïsBy H. P. Lovecraft by By H. P. Lovecraft
Scraped: The Challenge from BeyondBy C.L. Moore, A. 