In [1]:
import os
import requests
from bs4 import BeautifulSoup
import time
import random


""" 
This script scrapes French-language works from Wikisource for a given author.
It filters out works that are likely to be poems, as they are not suitable for this specific project.
"""
def sanitize_filename(title):
    return "_".join(title.split()).replace("/", "_").replace("\\", "_")

def download_french_works_from_wikisource(author_name, folder_path):
    base_url = "https://fr.wikisource.org"
    author_url = f"{base_url}/wiki/Auteur:{author_name.replace(' ', '_')}"
    
    try:
        response = requests.get(author_url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching author page {author_name}: {e}")
        return
    
    soup = BeautifulSoup(response.text, 'html.parser')
    author_folder = os.path.join(folder_path, sanitize_filename(author_name))
    os.makedirs(author_folder, exist_ok=True)
    
    work_links = []
    for link in soup.select("div.mw-parser-output ul li a"):  # Adjusted for actual structure
        title = link.text.strip()
        href = link['href']
        if "poésies" not in title.lower() and "poème" not in title.lower():
            work_links.append((title, base_url + href))
    
    for title, work_url in work_links:
        try:
            work_response = requests.get(work_url)
            work_response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching work {title}: {e}")
            continue
        
        work_soup = BeautifulSoup(work_response.text, 'html.parser')
        text_container = work_soup.find("div", class_="mw-parser-output")
        if not text_container:
            print(f"No valid text content found for {title}")
            continue
        
        text_content = "\n".join([p.get_text() for p in text_container.find_all("p") if p.get_text().strip()])
        if not text_content.strip():
            print(f"Skipping empty content for {title}")
            continue
        
        file_path = os.path.join(author_folder, f"{sanitize_filename(title)}.txt")
        
        try:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(text_content)
            print(f"Saved: {file_path}")
        except IOError as e:
            print(f"Error saving file {file_path}: {e}")
        
        time.sleep(random.uniform(1, 3))

def scrape_wikisource_authors(authors, folder_path):
    for author in authors:
        print(f"\nScraping works of {author}...")
        download_french_works_from_wikisource(author, folder_path)

authors = [''] # Add authors here
folder_path = "" # Add folder path here
scrape_wikisource_authors(authors, folder_path)


Scraping works of ...
Error fetching author page : 404 Client Error: Not Found for url: https://fr.wikisource.org/wiki/Auteur:
