In [38]:
import mwclient
import re
import requests
import os
from typing import List
from dotenv import load_dotenv

"""
This script automates the process of updating Wikimedia Commons files using the Wikimedia Commons API.
It searches for files matching a specific query, checks if they contain a target pattern,
and replaces it with a new pattern if found. The updated files are then saved back to Commons.

Features:
- Searches for files matching the given criteria.
- Logs into Wikimedia Commons using a bot account.
- Checks and updates file descriptions where needed.
- Saves changes with an appropriate edit summary.

Requirements:
- Install `mwclient`, `requests`, and `python-dotenv` via pip if not already installed.
- Ensure you have a Wikimedia bot account with appropriate permissions.
- Store credentials securely in a `.env` file.

Created by Olaf Janssen, Wikimedia coordinator of KB, national library of the Netherlands,
with much help from ChatGPT.

Latest update: 8 March 2025

License = CC0, public domain.

"""

# Populate the process environment from a .env file, if one is present
load_dotenv()

# Bot account credentials and user agent for Wikimedia Commons.
# Each falls back to an empty string when the variable is not set.
USERNAME = os.getenv("WIKIMEDIA_USERNAME", "")
PASSWORD = os.getenv("WIKIMEDIA_PASSWORD", "")
USER_AGENT = os.getenv("WIKIMEDIA_USER_AGENT", "")

# Ready-made HTTP headers; these are not actually used in this script
headers = {'User-Agent': USER_AGENT}

# Fail fast: the bot cannot log in unless both credentials are non-empty
if not (USERNAME and PASSWORD):
    raise ValueError("Wikimedia credentials are missing. Please set WIKIMEDIA_USERNAME and WIKIMEDIA_PASSWORD in a .env file or as environment variables.")

# Search API endpoint and query parameters. The query corresponds to this
# Special:Search request on Commons:
# https://commons.wikimedia.org/w/index.php?sort=last_edit_desc&search=%22IA+ddd%22+incategory%3A%22Media+from+Delpher%22+-hastemplate%3A%22Delpher%22&title=Special%3ASearch&profile=advanced&fulltext=1&ns6=1

SEARCH_URL: str = "https://commons.wikimedia.org/w/api.php"
SEARCH_PARAMS: dict = dict(
    action="query",
    format="json",
    list="search",
    srsearch='"IA ddd" incategory:"Media from Delpher" -hastemplate:"Delpher"',
    srlimit=500,  # Results per request; adjust as needed, max=500
    srnamespace=6,  # Namespace 6 = Files
)

# OLD_PATTERN is a regular expression matching the end of a template directly
# followed by the license header section; NEW_PATTERN is the replacement text,
# which inserts {{Delpher}} between the two.
OLD_PATTERN: str = r"\}\}\n== \{\{int:license-header\}\} ==\n\{\{PD-old-70-expired\}\}"
NEW_PATTERN: str = r"}}\n{{Delpher}}\n== {{int:license-header}} ==\n{{PD-old-70-expired}}"

# Number of files we want to process in 1 run
maxfiles = 5000

def get_files() -> List[str]:
    """Collect unique file titles from Wikimedia Commons matching the search query.

    Pages through the search API with ``SEARCH_PARAMS`` until at least
    ``maxfiles`` unique titles have been gathered or the API response no
    longer contains a ``continue`` block. A whole result page is added at a
    time, so the returned list can exceed ``maxfiles`` by up to one page of
    results.

    Returns:
        List[str]: The unique titles collected, in arbitrary order (they are
        accumulated in a set). If a request fails, the error is printed and
        whatever was collected up to that point is returned, possibly an
        empty list.
    """
    all_files = set()  # Use a set to store unique file titles
    params = SEARCH_PARAMS.copy()  # Make a copy to avoid modifying the global dictionary
    timeout_seconds = 30  # Without a timeout a stalled connection would hang the run forever

    try:
        # The context manager closes the session's connections on exit
        with requests.Session() as session:
            # Identify the bot when a user agent is configured; an empty value
            # is skipped so the library default is sent rather than a blank header
            if USER_AGENT:
                session.headers["User-Agent"] = USER_AGENT

            while len(all_files) < maxfiles:
                response = session.get(SEARCH_URL, params=params, timeout=timeout_seconds)
                response.raise_for_status()
                data = response.json()

                # Extract file titles, ensuring uniqueness
                files = {page["title"] for page in data.get("query", {}).get("search", [])}
                all_files.update(files)

                print(f"Retrieved {len(all_files)} unique files so far...")

                # Stop if no more results are available
                if "continue" not in data:
                    break

                # Carry the continuation values into the next request
                params.update(data["continue"])

    except requests.RequestException as e:
        print(f"Error fetching file list: {e}")

    print(f"Final count: {len(all_files)} unique files retrieved.")
    print(list(all_files))
    return list(all_files)


def login_to_commons() -> mwclient.Site:
    """Connect to Wikimedia Commons with mwclient and log in as the bot account.

    The configured ``USER_AGENT`` is passed to mwclient so the bot's requests
    identify themselves; when it is empty, mwclient's default is used.

    Returns:
        mwclient.Site: The site object after a successful login.

    Raises:
        mwclient.LoginError: If the login is rejected (re-raised after printing).
        Exception: Any other error during connection or login (re-raised
            after printing).
    """
    try:
        site = mwclient.Site(
            "commons.wikimedia.org",
            # None keeps mwclient's default when no user agent is configured
            clients_useragent=USER_AGENT or None,
        )
        site.login(USERNAME, PASSWORD)
        return site
    except mwclient.LoginError as e:
        print(f"Login failed: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error during login: {e}")
        raise
        
def process_files(site: mwclient.Site, file_titles: List[str]) -> None:
    """Insert the replacement text into every listed page that matches the pattern.

    For each title the page's wikitext is fetched; every occurrence of
    ``OLD_PATTERN`` is replaced with ``NEW_PATTERN`` and the page is saved.
    Pages without a match are reported and left unchanged. An error on one
    page is printed and does not stop the remaining pages.

    Args:
        site (mwclient.Site): The authenticated Wikimedia Commons site instance.
        file_titles (List[str]): A list of file titles to process.

    Returns:
        None
    """
    edit_summary = "Added {{Delpher}} template to file description using custom PAWS-botscript "

    for position, title in enumerate(file_titles, start=1):
        try:
            page = site.pages[title]
            # subn reports how many replacements were made, so one pass both
            # detects the pattern and produces the updated text
            updated_text, replacements = re.subn(OLD_PATTERN, NEW_PATTERN, page.text())

            if not replacements:
                print(f"Search pattern not found in this file: {title}")
                continue

            page.save(updated_text, summary=edit_summary)
            print(f"  {position} - Updated: {title}")
        except Exception as e:
            print(f"Error processing {title}: {e}")

def main() -> None:
    """Run the whole job: fetch the file list, log in, then update the pages.

    Nothing is attempted when the file list is empty. Any exception raised
    during login or processing is printed and ends the run.
    """
    file_titles = get_files()

    if file_titles:
        try:
            process_files(login_to_commons(), file_titles)
            print(f"Finished, all {len(file_titles)} files processed.")
        except Exception as e:
            print(f"Script terminated due to error: {e}")
    else:
        print("No files found or error retrieving files.")

if __name__ == "__main__":
    main()


Retrieved 500 unique files so far...
Retrieved 998 unique files so far...
Retrieved 1278 unique files so far...
Retrieved 1652 unique files so far...
Retrieved 2102 unique files so far...
Retrieved 2602 unique files so far...
Retrieved 2836 unique files so far...
Retrieved 3147 unique files so far...
Retrieved 3516 unique files so far...
Retrieved 4006 unique files so far...
Retrieved 4454 unique files so far...
Retrieved 4789 unique files so far...
Retrieved 5223 unique files so far...
Final count: 5223 unique files retrieved.
['File:Middelburgsche courant 12-10-1841 (IA ddd 010275505 mpeg21).pdf', 'File:Leeuwarder courant 17-02-1809 (IA ddd 010578997 mpeg21).pdf', 'File:Bredasche courant 10-12-1848 (IA ddd 010174446 mpeg21).pdf', 'File:De Tĳd - godsdienstig-staatkundig dagblad 01-03-1852 (IA ddd 010247674 mpeg21).pdf', 'File:Middelburgsche courant 15-02-1840 (IA ddd 010275249 mpeg21).pdf', 'File:Rotterdamsche courant 23-01-1816 (IA ddd 010390226 mpeg21).pdf', 'File:De Tĳd - godsdiens