In [None]:
# Vérifier les ressources disponibles
!nvidia-smi
!df -h
!free -h

# Installation des packages requis
!pip install torch transformers accelerate
!pip install vllm>=0.2.0
!pip install datasets
!pip install faiss-cpu  # Pour la recherche vectorielle
!pip install sentence-transformers

In [None]:
import os
from google.colab import drive

# Monter Google Drive (optionnel, pour sauvegarder)
drive.mount('/content/drive')

# Cloner le repository
!git clone https://github.com/Yuan-Li-FNLP/R3-RAG.git
%cd R3-RAG

# Télécharger un modèle plus petit (choisir selon la RAM disponible)
print("Téléchargement du modèle R3-RAG-CS-Qwen (7B)...")
!git lfs install

# Alternative : utiliser un modèle cold start (plus petit)
!git clone https://huggingface.co/Yuan-Li-FNLP/R3-RAG-CS-Qwen

In [None]:
import os

MODEL_PATH = "/content/R3-RAG/R3-RAG-CS-Qwen"

# Check that the model directory exists; when it does, report the combined
# size of the regular files directly inside it (sub-directories are ignored).
if not os.path.exists(MODEL_PATH):
    print(f"❌ Modèle non trouvé : {MODEL_PATH}")
else:
    print(f"✅ Modèle trouvé : {MODEL_PATH}")
    total_bytes = 0
    for entry_name in os.listdir(MODEL_PATH):
        entry_path = os.path.join(MODEL_PATH, entry_name)
        if os.path.isfile(entry_path):
            total_bytes += os.path.getsize(entry_path)
    print(f"Taille : {total_bytes / (1024**3):.2f} GB")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer from the local model directory
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the model with reduced (float16) precision; device placement is
# delegated to the library through device_map="auto".
load_options = dict(
    torch_dtype=torch.float16,
    device_map="auto",
    # NOTE(review): trust_remote_code presumably permits running custom code
    # shipped with the checkpoint - confirm it is actually required here.
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **load_options)

print("✅ Modèle chargé avec succès !")
print(f"Modèle sur device : {model.device}")

30 min

In [None]:
def test_model(question, max_new_tokens=200, temperature=0.7):
    """Generate the model's continuation for a single question.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``torch`` names
    being defined by earlier cells.

    Args:
        question: The question text inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not included). Sampling is enabled, so the output varies between
        calls.
    """
    # Prompt format for R3-RAG
    prompt = (
        f"Question: {question}\n"
        "\n"
        "Let me think step by step and retrieve relevant information.\n"
        "\n"
        "Action: "
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; the generated sequence starts with them.
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the prompt at the token level. Slicing the decoded string by
    # len(prompt) is fragile: decoding does not always reproduce the prompt
    # character for character, which would shift the cut point.
    generated_ids = outputs[0][prompt_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

# Smoke test with a simple single-hop question
question = "What is the capital of France?"
response = test_model(question)
print(
    f"Question: {question}",
    f"Response: {response}",
    sep="\n",
)

In [None]:
def test_model(question, max_new_tokens=200, temperature=0.7):
    """Generate the model's continuation for a single question.

    NOTE(review): this notebook defines ``test_model`` in an earlier cell as
    well; this later definition is the one in effect once the cell has run.
    Consider keeping a single definition.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``torch`` names
    being defined by earlier cells.

    Args:
        question: The question text inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not included). Sampling is enabled, so the output varies between
        calls.
    """
    # Prompt format for R3-RAG
    prompt = (
        f"Question: {question}\n"
        "\n"
        "Let me think step by step and retrieve relevant information.\n"
        "\n"
        "Action: "
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; the generated sequence starts with them.
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the prompt at the token level. Slicing the decoded string by
    # len(prompt) is fragile: decoding does not always reproduce the prompt
    # character for character, which would shift the cut point.
    generated_ids = outputs[0][prompt_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

3h

In [None]:
# Test with a multi-hop question (the answer requires chaining two facts)
question = "Who is the director of the movie that won the Academy Award for Best Picture in 2020?"
response = test_model(question)
print(
    f"Question: {question}",
    f"Response: {response}",
    sep="\n",
)