# Minecraft Wiki Loader Notebook
This notebook demonstrates how to fetch and process data from the French Minecraft Wiki using Python. It contains various functions and a custom loader class for extracting and processing wiki data.

## Importing Necessary Libraries
The libraries used in this notebook include:
- `requests` for making HTTP requests
- `BeautifulSoup` for parsing HTML content
- Classes from `langchain_core` for handling documents and loaders

In [None]:
from typing import AsyncIterator, Iterator
import requests
from bs4 import BeautifulSoup

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document

## Fetching Revision IDs
The `fetch_rvid` function retrieves the revision ID of a wiki page as of a specified date. This is essential for accessing historical content.

In [None]:
def fetch_rvid(title, date="2021-01-01T00:00:00.000Z", timeout=30):
    """Return the ID of the most recent revision of a page at or before a date.

    Args:
        title: Title of the page on fr.minecraft.wiki.
        date: Timestamp sent as ``rvstart``; revisions are listed from this
            point going backwards in time and only the first one is kept.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout, ``requests`` would wait indefinitely on a stalled server.

    Returns:
        The revision ID, or ``None`` when the API does not answer with
        status 200, the body is not valid JSON, or no revision is listed.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    # API request parameters
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": title,
        "formatversion": "2",
        "rvprop": "ids",
        "rvlimit": "1",
        "rvstart": date,
        "rvdir": "older"
    }

    # Make the API request
    headers = {"User-Agent": "MyScript/1.0 (myemail@example.com)"}
    response = requests.get(
        "https://fr.minecraft.wiki/api.php",
        params=params,
        headers=headers,
        timeout=timeout,
    )

    if response.status_code != 200:
        return None

    try:
        data = response.json()
    except ValueError:
        # A 200 response with a non-JSON body (e.g. an HTML error page) is
        # treated like any other failed lookup.
        return None

    # Extract the revisions data safely
    pages = data.get("query", {}).get("pages", [])
    if not pages:
        return None

    revisions = pages[0].get("revisions", [])
    if not revisions:
        return None

    return revisions[0].get("revid")

## Fetching Category Members
The `fetch_category_members` function retrieves the titles of all pages within a specified category. It handles pagination if the number of pages exceeds the API's limit.

In [None]:
def fetch_category_members(category, limit=500, timeout=30):
    """Return the titles of all pages that belong to a wiki category.

    The API is queried repeatedly, following the ``cmcontinue`` token it
    returns, until no continuation token is left.

    Args:
        category: Category name without the ``Catégorie:`` prefix.
        limit: Value sent as ``cmlimit`` (members requested per API call).
        timeout: Seconds to wait for each API call before giving up.

    Returns:
        A list of page titles, in the order the API returned them.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection errors or timeouts.
    """
    members = []

    # These values never change between pages, so build them once.
    headers = {"User-Agent": "MyScript/1.0 (myemail@example.com)"}
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Catégorie:{category}",
        "cmlimit": limit,
        "format": "json",
    }

    while True:
        response = requests.get(
            "https://fr.minecraft.wiki/api.php",
            params=params,
            headers=headers,
            timeout=timeout,
        )
        # Fail loudly on HTTP errors instead of decoding an error page or
        # silently returning a partial list.
        response.raise_for_status()
        data = response.json()

        # Collect members
        members.extend(
            page["title"]
            for page in data.get("query", {}).get("categorymembers", [])
        )

        # Check if more pages are available
        cmcontinue = data.get("continue", {}).get("cmcontinue")
        if not cmcontinue:
            break
        params["cmcontinue"] = cmcontinue

    return members

## Fetching Page Content
The `fetch_page_content` function fetches the HTML of a wiki page at a specific revision. It parses the page with `BeautifulSoup` and returns the `#mw-content-text` element, or `None` if the revision or the page cannot be retrieved.

In [None]:
def fetch_page_content(title, date="2021-01-01T00:00:00.000Z", timeout=30):
    """Fetch the main content element of a wiki page as it was at a given date.

    Args:
        title: Title of the page on fr.minecraft.wiki.
        date: Timestamp forwarded to ``fetch_rvid`` to pick the revision.
        timeout: Seconds to wait for the page download before giving up.

    Returns:
        The element matching ``#mw-content-text`` in the parsed page, or
        ``None`` when no revision ID is found, the page request does not
        return status 200, or the element is absent.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    from urllib.parse import quote

    rvid = fetch_rvid(title, date)
    if not rvid:
        return None

    # Percent-encode the title so characters such as '?', '#' or '%' cannot
    # be mistaken for URL syntax; '/' and ':' stay literal because they are
    # ordinary parts of wiki page paths (subpages, namespaces).
    page_path = quote(title.replace(' ', '_'), safe="/:")
    page_url = f"https://fr.minecraft.wiki/w/{page_path}?oldid={rvid}"

    print(f"collecting data from : {page_url}")

    headers = {"User-Agent": "MyScript/1.0 (myemail@example.com)"}
    response = requests.get(page_url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: Unable to fetch the page. Status code {response.status_code}")
        return None

    # Parse the HTML content and keep only the article body
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.select_one('#mw-content-text')

## Text Processing Functions
These utility functions determine whether a paragraph is worth extracting (`is_textable`) and clean up the text (`textify`).

In [None]:
def is_textable(p):
    """Tell whether a paragraph element holds text worth extracting.

    Args:
        p: An element exposing ``get_text()`` (e.g. a BeautifulSoup tag).

    Returns:
        ``True`` when the stripped text is longer than 30 characters, does
        not end with ":" and does not start with "Erreur"; ``False``
        otherwise.
    """
    # Extract the text once instead of re-walking the element per condition.
    text = p.get_text().strip()
    return (
        len(text) > 30  # skip empty <p> and useless fragments like "Alambic/BS"
        and not text.endswith(":")  # skip <p> followed by a table or an image
        and not text.startswith("Erreur")  # skip error message <p>
    )

def textify(p):
    """Return the element's text on one line with runs of spaces collapsed.

    Args:
        p: An element exposing ``get_text()`` (e.g. a BeautifulSoup tag).

    Returns:
        The text with every newline turned into a space and every run of
        consecutive spaces reduced to a single space. Leading and trailing
        spaces are kept (as at most one space each).
    """
    text = p.get_text().replace('\n', ' ')
    # A single replace pass leaves "a   b" as "a  b"; repeat until no double
    # space remains.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text

## Loader Class
The `MincraftWikiLoader` class (spelled this way in the code below) processes wiki pages within specified categories. It lazily loads paragraphs as `Document` objects for further use.

In [None]:
class MincraftWikiLoader(BaseLoader):
    """Loader that yields paragraphs of French Minecraft Wiki pages as documents.

    Page titles for every requested category are collected when the loader
    is constructed; page contents are only downloaded while iterating
    ``lazy_load``.
    """

    def __init__(self, categorys=("Bloc", "Environnement", "Gameplay", "Objets", "Redstone", "Entitée")) -> None:
        """Collect the titles of all pages in the given categories.

        Args:
            categorys: Iterable of category names, each passed to
                ``fetch_category_members``. The default is an immutable
                tuple so it cannot be altered between instantiations.
        """
        # Titles of every page found, in category order (duplicates are kept).
        self.categorys_page = []
        for category in categorys:
            self.categorys_page += fetch_category_members(category)
        # Number of collected page titles.
        self.page_count = len(self.categorys_page)

    def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
        """Yield one ``Document`` per usable paragraph, page after page.

        Pages whose content cannot be fetched are skipped. For each page,
        ``paragraph_number`` counts only the paragraphs that were yielded,
        starting at 0.
        """
        for page in self.categorys_page:
            page_content = fetch_page_content(page)
            if page_content is None:
                continue

            # Use underscores, as in the URL the page is fetched from, so the
            # recorded source is a well-formed link.
            source = f"https://fr.minecraft.wiki/w/{page.replace(' ', '_')}"

            paragraph_number = 0
            for p in page_content.select('div.mw-parser-output > p'):
                if not is_textable(p):
                    continue
                yield Document(
                    page_content=textify(p),
                    metadata={"paragraph_number": paragraph_number, "source": source},
                )
                paragraph_number += 1

## Loader Initialization and Testing
Here, the `MincraftWikiLoader` is initialized and tested by collecting the first 20 paragraphs from the specified categories.

In [None]:
# Building the loader immediately queries the wiki API for the members of
# every default category, so this cell needs network access.
loader =  MincraftWikiLoader()

In [None]:
# Collect the first 20 documents. Breaking as soon as the list is full avoids
# pulling a 21st document (and possibly downloading another page) from the
# lazy loader.
paragraphs = []
for document in loader.lazy_load():
    paragraphs.append(document)
    if len(paragraphs) >= 20:
        break

# Print the collected paragraphs
for paragraph in paragraphs:
    print(paragraph)

