In [None]:
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup

""" 
This script scrapes French-language works from project Gutenberg for a given author.
It filters out works that are likely to be poems, as they are not suitable for this specific project.
"""

# Names of the authors whose works should be downloaded. Each entry is used
# both as the Project Gutenberg search query and as the name of the author's
# sub-folder. Replace the empty placeholder with real names.
AUTHORS = [""]

def sanitize_filename(filename):
    """
    Return *filename* made safe to use as a single file or folder name.

    The characters \\ / * ? : " < > | are removed, every run of whitespace
    (including line breaks inside a scraped title) becomes one space, any
    remaining control characters are dropped, and leading whitespace as well
    as trailing dots and spaces are stripped, because Windows rejects or
    silently alters names that end with them.

    An empty string is returned unchanged; callers decide on a fallback name.
    """
    # Characters that are reserved in Windows file names.
    without_reserved = re.sub(r'[\\/*?:"<>|]', "", filename)
    # Collapse whitespace first so that a newline becomes a word separator
    # instead of gluing two words together when control characters are removed.
    single_spaced = re.sub(r'\s+', ' ', without_reserved)
    without_control = re.sub(r'[\x00-\x1f]', "", single_spaced)
    return without_control.lstrip().rstrip('. ')

_GUTENBERG_URL = "https://www.gutenberg.org"
_REQUEST_HEADERS = {'User-Agent': 'chrome'}
# Seconds to wait for the server; without a timeout a stalled connection
# would block the whole run indefinitely.
_REQUEST_TIMEOUT = 30


def _fetch(url, params=None):
    """Return the HTTP 200 response for *url*, or None if the request failed."""
    try:
        response = requests.get(
            url, params=params, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT
        )
    except requests.RequestException as error:
        # One unreachable page should not abort the remaining downloads.
        print(f"Request failed for {url}: {error}")
        return None
    return response if response.status_code == 200 else None


def _extract_book_links(search_soup):
    """Return the unique book-page URLs (/ebooks/<number>) found in a search page."""
    prefix = '/ebooks/'
    hrefs = [anchor['href'] for anchor in search_soup.find_all('a', href=True)]
    # Other links under /ebooks/ (search, bookshelves, author pages, ...) are
    # not book pages, so only purely numeric ids are kept.
    book_hrefs = [
        href for href in hrefs
        if href.startswith(prefix) and href[len(prefix):].isdigit()
    ]
    # dict.fromkeys removes duplicates while keeping the page order.
    return [_GUTENBERG_URL + href for href in dict.fromkeys(book_hrefs)]


def _read_book_metadata(book_soup):
    """Return (is_french, title) from the 'bibrec' table; title is None if absent."""
    is_french = False
    title = None
    table = book_soup.find('table', class_='bibrec')
    if table is None:
        return is_french, title
    for row in table.find_all('tr'):
        header = row.find('th')
        data = row.find('td')
        if header is None or data is None:
            continue
        if 'Language' in header.text and 'French' in data.text:
            is_french = True
        # Only the first 'Title' row is used.
        if title is None and header.text.strip() == 'Title':
            title = data.text.strip()
    return is_french, title


def _find_text_link(book_soup):
    """Return the URL of the first link whose href contains 'txt.utf-8', or None."""
    for anchor in book_soup.find_all('a', href=True):
        href = anchor['href']
        if 'txt.utf-8' in href:
            # Site-relative hrefs need the host prepended; absolute ones do not.
            if href.startswith(('http://', 'https://')):
                return href
            return _GUTENBERG_URL + href
    return None


def _download_book(book_link, author_folder):
    """Save the plain-text version of one book page if the book is in French."""
    book_page = _fetch(book_link)
    if book_page is None:
        return

    book_soup = BeautifulSoup(book_page.text, 'html.parser')
    is_french, title = _read_book_metadata(book_soup)
    if title is not None:
        print(title)
    real_title = title or "Unknown_Title"

    if not is_french:
        return

    text_link = _find_text_link(book_soup)
    if text_link is None:
        return

    text_response = _fetch(text_link)
    if text_response is None:
        print(f"Failed to download text for {real_title}")
        return

    # The link explicitly names the UTF-8 rendition, so decode it as UTF-8
    # instead of relying on the encoding guessed from the response headers.
    text_response.encoding = 'utf-8'

    sanitized_title = sanitize_filename(real_title) or "Unknown_Title"
    file_path = os.path.join(author_folder, f"{sanitized_title}.txt")
    # newline='' writes the downloaded line endings unchanged; the default
    # text mode would turn '\r\n' into '\r\r\n' on Windows.
    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        f.write(text_response.text)
    print(f"Saved: {file_path}")


def download_french_books(author_name, folder_path):
    """
    Downloads and saves French-language text files of a given author from Project Gutenberg.

    The author name is used as the search query. Every book page linked from
    the first page of results is inspected, and books whose 'Language'
    metadata contains 'French' are saved as '<title>.txt' in a sub-folder of
    *folder_path* named after the author. No filtering by publication date or
    genre is performed.

    Args:
        author_name: Name to search for; also used (sanitized) as folder name.
        folder_path: Root folder in which the author's folder is created.

    Returns:
        None. Progress and failures are reported with print().
    """
    author_folder = os.path.join(folder_path, sanitize_filename(author_name))
    os.makedirs(author_folder, exist_ok=True)

    # Passing the query through `params` lets requests URL-encode accents and
    # other special characters in the author's name.
    search_page = _fetch(
        f"{_GUTENBERG_URL}/ebooks/search/",
        params={'query': author_name, 'submit_search': 'Search'},
    )
    if search_page is None:
        print(f"Failed to fetch search results for {author_name}")
        return

    soup = BeautifulSoup(search_page.content, 'html.parser')

    for book_link in _extract_book_links(soup):
        _download_book(book_link, author_folder)
        # Pause after every book, including skipped ones, so the server is
        # never hit with back-to-back requests.
        time.sleep(random.uniform(1, 4))

def download_all_authors(folder_path):
    """
    Download the French books of every author listed in AUTHORS.

    Args:
        folder_path: Root folder passed on to download_french_books for each
            author.

    Blank entries (such as the empty placeholder) are skipped, because an
    empty name would otherwise be sent to the site as an empty search query.
    """
    for author in AUTHORS:
        if not author.strip():
            print("Skipping blank author name; add real names to AUTHORS.")
            continue
        print(f"\nDownloading books for {author}...")
        download_french_books(author, folder_path)

# Root folder in which the books are saved; one sub-folder per author is
# created inside it. Add the path to the folder where you want to save the books.
folder_path = ""
# Starts the download for every configured author as soon as this code runs.
download_all_authors(folder_path)


Downloading books for Catulle Mendès...
International Short Stories: French
Le crime du vieux Blas
Saved: C:/Users/Binesh/Desktop/project_victor-hugo/new_authors\Catulle Mendès\Le crime du vieux Blas.txt
Les oiseaux bleus
Saved: C:/Users/Binesh/Desktop/project_victor-hugo/new_authors\Catulle Mendès\Les oiseaux bleus.txt
Véritables mémoires de Cagliostro
Saved: C:/Users/Binesh/Desktop/project_victor-hugo/new_authors\Catulle Mendès\Véritables mémoires de Cagliostro.txt

Downloading books for Joris-Karl Huysmans...
Against the Grain
Là-bas
A rebours
Saved: C:/Users/Binesh/Desktop/project_victor-hugo/new_authors\Joris-Karl Huysmans\A rebours.txt
The Symbolist Movement in Literature
En route
Saved: C:/Users/Binesh/Desktop/project_victor-hugo/new_authors\Joris-Karl Huysmans\En route.txt
The Cathedral
Short story classics (Foreign), Vol. 5, French II
En ménage
Saved: C:/Users/Binesh/Desktop/project_victor-hugo/new_authors\Joris-Karl Huysmans\En ménage.txt
En Route
Sainte Lydwine de Schiedam
