# Projet : Construire un RAG basé sur les données Minecraft de Wikipédia

Ce projet vise à développer un système de **Retrieval-Augmented Generation (RAG)** qui tire parti des informations disponibles sur le Wikipédia de Minecraft. Le pipeline comprend plusieurs étapes, allant de l'importation des bibliothèques nécessaires jusqu'à la création d'une base de données vectorielle et la mise en place d'un système de requêtes utilisant un modèle de langage.

---

## Étapes du Projet

### 1. Importer les bibliothèques nécessaires

Pour commencer, nous devrons importer les bibliothèques Python indispensables pour manipuler les données, effectuer les requêtes, construire une base de données vectorielle et interagir avec le modèle de langage.


In [None]:
import requests
import re
import ollama
from bs4 import BeautifulSoup
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma

### 2. Récupérer les données de Wikipédia sur Minecraft

Dans cette étape, nous établirons une connexion avec l'API du wiki Minecraft francophone (fr.minecraft.wiki) pour récupérer les données pertinentes sur Minecraft.


In [None]:
# API endpoint of the French Minecraft wiki.
url = "https://fr.minecraft.wiki/api.php"

# Wiki categories whose content is considered useful for the knowledge base.
categorys = [
    "Bloc",
    "Environnement",
    "Gameplay",
    "Objets",
    "Redstone",
    "Entitée",
]

# Function to fetch category members
def fetch_category_members(category, limit=500, timeout=30):
    """Return every member of a wiki category, following API pagination.

    Args:
        category: Category name without the ``Catégorie:`` prefix.
        limit: Value sent as ``cmlimit`` (members requested per API call).
        timeout: Seconds to wait for each HTTP response before giving up,
            so a stalled connection cannot block the notebook forever.

    Returns:
        A list with the ``categorymembers`` entries of every result page,
        in the order the API returned them.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Both dictionaries are identical for every page except for the
    # continuation token, so build them once outside the loop.
    headers = {"User-Agent": "MyScript/1.0 (myemail@example.com)"}
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Catégorie:{category}",
        "cmlimit": limit,
        "format": "json",
    }
    members = []

    while True:
        response = requests.get(
            url, params=params, headers=headers, timeout=timeout
        )
        # Fail loudly instead of silently returning a truncated list.
        response.raise_for_status()
        data = response.json()

        members.extend(data.get("query", {}).get("categorymembers", []))

        # The API hands back a continuation token while more pages remain.
        cmcontinue = data.get("continue", {}).get("cmcontinue")
        if not cmcontinue:
            return members
        params["cmcontinue"] = cmcontinue

def fetch_rvid(title, date="2021-01-01T00:00:00.000Z", timeout=30):
    """Look up the ID of one revision of a wiki page around a given date.

    The API is asked for a single revision (``rvlimit=1``), enumerating
    from ``date`` toward older revisions (``rvdir=older``).

    Args:
        title: Title of the wiki page.
        date: Timestamp sent as ``rvstart``.
        timeout: Seconds to wait for the HTTP response before giving up,
            so a stalled connection cannot block the caller forever.

    Returns:
        The ``revid`` of the revision found, or ``None`` when the request
        does not return HTTP 200 or the response holds no page/revision.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": title,
        "formatversion": "2",
        "rvprop": "ids",
        "rvlimit": "1",
        "rvstart": date,
        "rvdir": "older"
    }
    headers = {"User-Agent": "MyScript/1.0 (myemail@example.com)"}

    response = requests.get(
        "https://fr.minecraft.wiki/api.php",
        params=params,
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        return None

    # Walk down the response defensively: any missing level means "not found".
    pages = response.json().get("query", {}).get("pages", [])
    if not pages:
        return None

    revisions = pages[0].get("revisions", [])
    if not revisions:
        return None

    return revisions[0].get("revid", None)

def extract_table_description(table):
    """Turn a crafting-recipe table into a one-sentence French description.

    Args:
        table: Parsed HTML element of the recipe table. It must support the
            BeautifulSoup ``find`` / ``get_text`` / ``get`` API.

    Returns:
        A sentence naming the crafted item and listing its ingredients,
        e.g. ingredients ``"A + B + C"`` are rendered as ``"A, B et C"``.

    Raises:
        ValueError: If the table has no ``<td>`` cell, no
            ``span.mcui-output`` containing an ``<img>``, or the image has
            no ``alt`` attribute.
    """
    ingredients_cell = table.find('td')
    if ingredients_cell is None:
        raise ValueError("recipe table has no <td> cell listing ingredients")

    # Ingredients are separated by '+' in the cell text. Split on it and
    # collapse whitespace so the sentence has no stray or doubled spaces.
    names = [
        " ".join(part.split())
        for part in ingredients_cell.get_text().split('+')
    ]
    names = [name for name in names if name]
    if len(names) > 1:
        ingredients = ", ".join(names[:-1]) + " et " + names[-1]
    else:
        ingredients = "".join(names)

    # The crafted item is identified by the alt text of the output image.
    output_slot = table.find('span', class_='mcui-output')
    output_image = output_slot.find('img') if output_slot is not None else None
    alt_text = output_image.get('alt') if output_image is not None else None
    if alt_text is None:
        raise ValueError("recipe table has no output image with an alt text")

    # Keep only what follows the last colon of the alt text.
    output_name = alt_text.split(":")[-1].strip()

    description = f"Pour fabriquer une {output_name}, utilisez les ingrédients suivants : {ingredients}."
    return description

def fetch_page_content(title, date="2021-01-01T00:00:00.000Z", timeout=30):
    """Download one revision of a wiki page and extract its text chunks.

    Args:
        title: Title of the wiki page.
        date: Timestamp forwarded to ``fetch_rvid`` to choose the revision.
        timeout: Seconds to wait for the page download before giving up.

    Returns:
        A list of strings: the text of each top-level paragraph that is
        not blank and does not end with ``":"``, followed by one generated
        sentence per table marked ``data-description="Fabrication"``.
        ``None`` when no revision is found or the page request does not
        return HTTP 200.
    """
    from urllib.parse import quote

    rvid = fetch_rvid(title, date)
    if not rvid:
        return None

    # Percent-encode the title so characters such as '?' or '#' cannot be
    # mistaken for the start of the query string or fragment.
    page_name = quote(title.replace(' ', '_'), safe="/:")
    page_url = f"https://fr.minecraft.wiki/w/{page_name}?oldid={rvid}"

    print(f"collecting data from : {page_url}")

    headers = {"User-Agent": "MyScript/1.0 (myemail@example.com)"}
    response = requests.get(page_url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: Unable to fetch the page. Status code {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    paragraph_texts = []
    for paragraph in soup.select('#mw-content-text > div.mw-parser-output > p'):
        text = paragraph.get_text()
        stripped = text.strip()
        # Skip blank paragraphs and ones ending with a colon.
        if stripped and not stripped.endswith(":"):
            paragraph_texts.append(text)

    for table in soup.select('#mw-content-text table[data-description="Fabrication"]'):
        try:
            paragraph_texts.append(extract_table_description(table))
        except (AttributeError, ValueError) as exc:
            # One malformed recipe table must not discard the whole page.
            print(f"Skipping malformed recipe table on '{title}': {exc}")

    return paragraph_texts


### 3. Extraire et traiter les données récupérées

In [None]:
# Build the page listing: map every category name to its fetched members.
categorys_page = {
    category_name: fetch_category_members(category_name)
    for category_name in categorys
}

In [None]:
# Preview the chunks extracted from the "Fabrication" page.
# fetch_page_content returns None when the page cannot be loaded, so fall
# back to an empty list instead of crashing inside enumerate().
for i, paragraph in enumerate(fetch_page_content("Fabrication") or [], start=1):
    print(f"Paragraph {i}:\n{paragraph}\n")

### 4. Construire une base de données vectorielle

In [None]:
def split_wiki():
    """Collect the text chunks of every entry of ``categorys_page``.

    Returns:
        A flat list with the chunks returned by ``fetch_page_content`` for
        each entry. Entries whose page cannot be loaded are skipped.
    """
    chunks = []
    # NOTE(review): iterating the dict yields its keys, i.e. the category
    # names, not the titles of the category members stored as values.
    # Confirm whether the member pages were meant to be loaded instead.
    for page in categorys_page:
        print(f"loading page : {page}")
        page_chunks = fetch_page_content(page)
        # fetch_page_content returns None on failure; `chunks += None`
        # would raise TypeError and abort the whole crawl.
        if page_chunks:
            chunks.extend(page_chunks)
    return chunks


chunks = split_wiki()
print(f"Total number of chunks: {len(chunks)}")
print("\n")
# Replace the placeholder with your preferred embedding model.
embeddings = OllamaEmbeddings(model="embedding-model-here")

# `chunks` is a list of plain strings, so build the store with from_texts;
# from_documents expects Document objects and would fail on str items.
db = Chroma.from_texts(chunks, embedding=embeddings, collection_name="local-rag")

### 5. Mettre en place le système de requêtes au modèle

In [None]:



# Initialize the Ollama client.
client = ollama.Client()

# Model used to clean up the scraped text.
model_name = "qwen2.5:0.5b"

# Fetch the second page listed in the "Bloc" category as sample input.
scraped_paragraphs = fetch_page_content(categorys_page["Bloc"][1]["title"])

# fetch_page_content returns a list of strings (or None on failure). Join it
# so the prompt contains the text itself rather than the repr of a list.
scraped_text = "\n".join(scraped_paragraphs or [])

# Prompt asking the model to clean the text without altering its content.
prompt = f"""
Clean and structure the following text to make it readable and coherent. 
Ensure that the output stays true to the original content, without introducing new information, changing the original meaning, or adding extraneous details. 
Only adjust grammar, punctuation, and structure for clarity. 
Do not add or modify any content. 
Keep technical terms intact. 
Here is the text:

{scraped_text}
"""

# Generate the response; a low temperature keeps the output close to the input.
response = client.generate(model=model_name, prompt=prompt, options={"temperature":0.1})

# Print the cleaned text.
print("Cleaned Text:")
print(response["response"])