In [1]:
from ragatouille import RAGTrainer

# Build the trainer used by the rest of the notebook. It is configured with the
# "colbert-ir/colbertv2.0" checkpoint as the pretrained model, "JerryColBERT"
# as the model name, and English ("en") as the language code.
trainer = RAGTrainer(
    language_code="en",
    pretrained_model_name="colbert-ir/colbertv2.0",
    model_name="JerryColBERT",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import requests


def get_wikipedia_page(title: str, timeout: float = 10.0):
    """
    Retrieve the full text content of a Wikipedia page.

    :param title: str - Title of the Wikipedia page.
    :param timeout: float - Seconds to wait for the HTTP request before giving up.
    :return: str or None - Plain-text extract of the page, or None when the
        response lists no page or the page carries no "extract" field.
    :raises requests.HTTPError: if the API answers with an HTTP error status.
    """
    # Wikipedia API endpoint
    URL = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }

    # Custom User-Agent header to comply with Wikipedia's best practices
    headers = {"User-Agent": "RAGatouille_tutorial/0.0.1 (ben@clavie.eu)"}

    # An explicit timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(URL, params=params, headers=headers, timeout=timeout)
    # Surface HTTP failures directly rather than trying to parse an error body.
    response.raise_for_status()
    data = response.json()

    # Extracting page content. The "query"/"pages" keys are looked up
    # defensively so a response without them yields None instead of a KeyError.
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), None)
    if page is None:
        return None
    return page.get("extract")

In [3]:
# The corpus is a one-element list holding the text fetched for the
# "Hayao_Miyazaki" Wikipedia page.
my_full_corpus = [
    get_wikipedia_page(title="Hayao_Miyazaki"),
]

In [5]:
from ragatouille.data import CorpusProcessor, llama_index_sentence_splitter

# Run the corpus through a CorpusProcessor that uses the llama-index sentence
# splitter, requesting a chunk_size of 256; the result is kept in `documents`.
corpus_processor = CorpusProcessor(document_splitter_fn=llama_index_sentence_splitter)
documents = corpus_processor.process_corpus(
    my_full_corpus,
    chunk_size=256,
)

In [6]:
import random

# Dedicated, seeded generator: the fake pairs are the same on every
# Restart & Run All, and the module-level `random` state is left untouched.
pair_rng = random.Random(42)

# Three example queries, each repeated three times.
queries = [
    "What manga did Hayao Miyazaki write?",
    "which film made ghibli famous internationally",
    "who directed Spirited Away?",
] * 3

# random.sample raises ValueError when asked for more items than the population
# holds, so cap the sample size at the number of available documents.
docs_per_query = min(10, len(documents))

# Pair every query with randomly drawn documents. These are NOT real relevance
# judgements -- the documents are picked at random as placeholder positives.
pairs = [
    (query, doc)
    for query in queries
    for doc in pair_rng.sample(documents, docs_per_query)
]

In [11]:
# Hand the (query, document) pairs to the trainer, with hard-negative mining
# switched on and num_new_negatives set to 10. The call is left as the cell's
# final expression so its return value is displayed as the cell output.
trainer.prepare_training_data(
    raw_data=pairs,
    all_documents=my_full_corpus,
    data_out_path="./data/",
    mine_hard_negatives=True,
    num_new_negatives=10,
)

Loading Hard Negative SimpleMiner dense embedding model BAAI/bge-small-en-v1.5...




Building hard negative index for 51 documents...
All documents embedded, now adding to index...
save_index set to False, skipping saving hard negative index
Hard negative index generated


'./data/'