In [1]:
import os
import uuid
import json
import asyncio
import re
import tqdm
import torch
import nest_asyncio
from typing import List, Dict
from getpass import getpass
from dotenv import load_dotenv

from document_loader import load_document_with_unstructured, split_document_with_unstructured
from embedding_models import SnowflakeArcticEmbedModel
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
from huggingface_hub import login


  from .autonotebook import tqdm as notebook_tqdm
2025-05-12 14:38:44,195 - INFO - PyTorch version 2.7.0 available.


In [3]:
# Apply the nest_asyncio patch so coroutines can be awaited from notebook cells.
nest_asyncio.apply()

# Compute backend: prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    _backend_name = "cuda"
elif torch.backends.mps.is_available():
    _backend_name = "mps"
else:
    _backend_name = "cpu"
device = torch.device(_backend_name)
print(f"Device used: {device}")

# Training hyper-parameters.
BATCH_SIZE = 2
EPOCHS = 5

# Dataset sizing. The per-split counts below assume TOTAL_EXAMPLES = 150.
TOTAL_EXAMPLES = 150  # Total number of (question, chunk) examples desired
TRAIN_RATIO = 0.7     # -> 105 examples
VAL_RATIO = 0.15      # -> 22 examples
TEST_RATIO = 0.15     # -> 23 examples (split_dataset gives the test split the remainder)

# Load variables from a .env file into the environment.
load_dotenv()

# Ask for the Hugging Face token interactively so it is never stored in the notebook.
hf_token = getpass("Enter your Hugging Face token: ")


Device used: mps


In [4]:
def load_documents(pdf_path: str) -> List[Document]:
    """Load one PDF file, or every PDF directly inside a directory.

    Args:
        pdf_path: Path to a ``.pdf`` file, or to a directory whose ``.pdf``
            files (top level only, not recursive) are all loaded.

    Returns:
        The concatenated results of ``load_document_with_unstructured`` for
        each PDF. For a directory, files are processed in sorted name order.

    Raises:
        FileNotFoundError: If ``pdf_path`` is neither a PDF file nor a
            directory.
    """
    def _is_pdf(name: str) -> bool:
        # Case-insensitive so that e.g. "Guide.PDF" is not silently skipped.
        return name.lower().endswith('.pdf')

    if os.path.isfile(pdf_path) and _is_pdf(pdf_path):
        pdf_files = [pdf_path]
    elif os.path.isdir(pdf_path):
        # os.listdir returns entries in arbitrary order. Sorting keeps the
        # document order stable, which matters because the dataset split
        # later in this notebook is order-based.
        candidates = (
            os.path.join(pdf_path, filename)
            for filename in sorted(os.listdir(pdf_path))
            if _is_pdf(filename)
        )
        # Skip sub-directories that merely happen to be named "*.pdf".
        pdf_files = [path for path in candidates if os.path.isfile(path)]
    else:
        raise FileNotFoundError(f"Le chemin spécifié n'existe pas ou n'est pas un fichier PDF: {pdf_path}")

    documents = []
    for file_path in pdf_files:
        documents.extend(load_document_with_unstructured(file_path))
    return documents


def prepare_documents(documents: List[Document]) -> List[Document]:
    """Chunk the documents and tag every chunk with a unique ``metadata["id"]``.

    The chunks come from ``split_document_with_unstructured``; each one is
    mutated in place to receive a UUID4 string that is distinct from every
    other id assigned during this call.
    """
    chunks = split_document_with_unstructured(documents)

    assigned_ids = set()
    for chunk in chunks:
        # Keep drawing until the id has not been handed out yet in this call.
        while True:
            candidate = str(uuid.uuid4())
            if candidate not in assigned_ids:
                break
        assigned_ids.add(candidate)
        chunk.metadata["id"] = candidate

    return chunks


def _parse_numbered_questions(raw_text: str) -> List[str]:
    """Extract question texts from a model reply formatted as a numbered list.

    Lines such as ``1. text``, ``1) text`` or ``1.text`` yield ``text``.
    When at least one numbered line is present, un-numbered lines (for example
    an introductory sentence) are dropped. When no line is numbered at all,
    every non-empty line is returned unchanged so nothing is lost.
    """
    lines = [line.strip() for line in raw_text.strip().split("\n")]
    lines = [line for line in lines if line]

    numbered = []
    for line in lines:
        match = re.match(r'\d+[.)]\s*(\S.*)', line)
        if match:
            numbered.append(match.group(1))

    return numbered if numbered else lines


async def create_questions(documents, n_questions_per_doc=2):
    """Generate questions for each document chunk using ChatGPT.

    Args:
        documents: Chunks exposing ``page_content`` and ``metadata["id"]``.
        n_questions_per_doc: Number of questions requested from the model for
            each chunk (the model's reply decides how many are actually kept).

    Returns:
        A ``(questions, relevant_docs)`` tuple where ``questions`` maps a
        generated question id to its text and ``relevant_docs`` maps the same
        id to a one-element list holding the source chunk id.

    Chunks whose generation raises are reported with ``print`` and skipped.
    """
    # Question generation model configuration
    qa_chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # Create prompt template for generating questions
    qa_prompt = """\
    Given the following context about puppy care and training, generate questions that an owner might ask about this information.
    
    You should generate {n_questions} questions that must be presented in the following format:
    
    1. QUESTION #1
    2. QUESTION #2
    ...
    
    Context:
    {context}
    """
    qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)
    question_generation_chain = qa_prompt_template | qa_chat_model

    questions = {}
    relevant_docs = {}

    # The context manager closes the bar even if an unexpected error escapes.
    with tqdm.tqdm(total=len(documents), desc="Generating questions for documents") as progress_bar:
        for document in documents:
            context = document.page_content
            doc_id = document.metadata["id"]

            try:
                chain_output = await question_generation_chain.ainvoke({
                    "context": context,
                    "n_questions": n_questions_per_doc
                })

                for q_text in _parse_numbered_questions(chain_output.content):
                    q_id = str(uuid.uuid4())
                    questions[q_id] = q_text
                    relevant_docs[q_id] = [doc_id]
            except Exception as e:
                # Best effort: one failing chunk must not abort the whole run.
                print(f"Error generating questions for document {doc_id}: {e}")

            progress_bar.update(1)

    return questions, relevant_docs


def split_dataset(documents, questions, relevant_docs, total_examples=150,
                  train_ratio=None, val_ratio=None):
    """Split the dataset into training, validation, and test sets.

    Questions are taken in their existing order (no shuffling): the first
    block goes to training, the next to validation, and the remainder to test.

    Args:
        documents: Chunks exposing ``page_content`` and ``metadata["id"]``.
        questions: Mapping of question id to question text.
        relevant_docs: Mapping of question id to a list of chunk ids.
        total_examples: Maximum number of questions to use. When fewer are
            available a warning is printed and all of them are used.
        train_ratio: Fraction of the used questions assigned to training.
            Defaults to the notebook-level ``TRAIN_RATIO``.
        val_ratio: Fraction assigned to validation. Defaults to the
            notebook-level ``VAL_RATIO``.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)``; each is a dict with
        keys ``"questions"``, ``"relevant_contexts"`` and ``"corpus"``, where
        the corpus holds only the chunks referenced by that split's questions.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    # Resolve defaults at call time so the notebook constants remain the
    # default while callers can still override them explicitly.
    if train_ratio is None:
        train_ratio = TRAIN_RATIO
    if val_ratio is None:
        val_ratio = VAL_RATIO
    # Small tolerance so float pairs that nominally sum to 1 are accepted.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            f"Invalid split ratios: train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    question_ids = list(questions.keys())

    # Make sure we have enough questions; otherwise use what is available.
    if len(question_ids) < total_examples:
        print(f"Warning: Only {len(question_ids)} questions available, less than the requested {total_examples}")
    n_examples = min(len(question_ids), total_examples)
    question_ids = question_ids[:n_examples]

    # int() truncates, so any rounding remainder ends up in the test split.
    n_train = int(n_examples * train_ratio)
    n_val = int(n_examples * val_ratio)

    train_ids = question_ids[:n_train]
    val_ids = question_ids[n_train:n_train + n_val]
    test_ids = question_ids[n_train + n_val:]

    # Built once and shared by the three splits; keeps document order.
    corpus_by_id = {doc.metadata["id"]: doc.page_content for doc in documents}

    def filter_dataset(ids):
        """Build the questions / relevant_contexts / corpus dict for ``ids``."""
        filtered_questions = {q_id: questions[q_id] for q_id in ids}
        filtered_relevant_docs = {q_id: relevant_docs[q_id] for q_id in ids}

        # Every chunk id referenced by this split's questions.
        doc_ids = set()
        for rel_docs in filtered_relevant_docs.values():
            doc_ids.update(rel_docs)

        filtered_corpus = {
            doc_id: text for doc_id, text in corpus_by_id.items() if doc_id in doc_ids
        }

        return {
            "questions": filtered_questions,
            "relevant_contexts": filtered_relevant_docs,
            "corpus": filtered_corpus
        }

    train_dataset = filter_dataset(train_ids)
    val_dataset = filter_dataset(val_ids)
    test_dataset = filter_dataset(test_ids)

    print(f"Sets created - Train: {len(train_dataset['questions'])} questions, Val: {len(val_dataset['questions'])} questions, Test: {len(test_dataset['questions'])} questions")

    return train_dataset, val_dataset, test_dataset


def fine_tune_embedding_model(train_dataset, val_dataset, model_name="Snowflake/snowflake-arctic-embed-l",
                              output_path='puppy_finetuned_embeddings'):
    """Fine-tune a SentenceTransformer on (question, chunk) pairs.

    Args:
        train_dataset: Dict with ``'questions'`` (query id -> text),
            ``'relevant_contexts'`` (query id -> list of chunk ids) and
            ``'corpus'`` (chunk id -> text). Only the first relevant chunk of
            each query is used as its positive.
        val_dataset: Dict with the same three keys, passed to
            ``InformationRetrievalEvaluator``.
        model_name: Name or path of the base model to load.
        output_path: Directory handed to ``model.fit`` as ``output_path``.

    Returns:
        The fine-tuned ``SentenceTransformer``.

    Raises:
        ValueError: If ``train_dataset`` contains no questions.

    Reads the notebook-level ``device``, ``BATCH_SIZE`` and ``EPOCHS``.
    """
    # Load base model
    model = SentenceTransformer(model_name)
    model = model.to(device)
    print(f"Model loaded and transferred to {device}")

    # Prepare training examples: one (query, positive chunk) pair per question
    corpus = train_dataset['corpus']
    queries = train_dataset['questions']
    relevant_docs = train_dataset['relevant_contexts']

    examples = [
        InputExample(texts=[query, corpus[relevant_docs[query_id][0]]])
        for query_id, query in queries.items()
    ]
    if not examples:
        raise ValueError("train_dataset contains no questions to train on")

    # MultipleNegativesRankingLoss treats the other positives in a batch as
    # negatives. Questions generated from the same chunk sit next to each other
    # in the dataset, so without shuffling a batch can pair two queries whose
    # positive is the same chunk, making the "negative" identical to the
    # positive. Shuffling makes such batches unlikely.
    loader = DataLoader(examples, batch_size=BATCH_SIZE, shuffle=True)

    # Configure loss function - MatryoshkaLoss for multi-dimensional embeddings
    matryoshka_dimensions = [1024, 512, 256, 128, 64]  # Adjust according to your model
    inner_train_loss = MultipleNegativesRankingLoss(model)
    train_loss = MatryoshkaLoss(
        model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
    )

    # Configure evaluator on the validation split
    evaluator = InformationRetrievalEvaluator(
        val_dataset['questions'],
        val_dataset['corpus'],
        val_dataset['relevant_contexts'],
    )

    # Warm up over the first 10% of all optimisation steps
    warmup_steps = int(len(loader) * EPOCHS * 0.1)

    # Fine-tune the model
    model.fit(
        train_objectives=[(loader, train_loss)],
        epochs=EPOCHS,
        warmup_steps=warmup_steps,
        output_path=output_path,
        show_progress_bar=True,
        evaluator=evaluator,
        evaluation_steps=50
    )

    return model


def push_to_huggingface(model, username, model_name_prefix="puppy-embed"):
    """Upload ``model`` to the Hugging Face Hub under a freshly generated name.

    The repository id has the form
    ``<username>/<model_name_prefix>-<first 8 characters of a random UUID4>``.
    It is passed to ``model.push_to_hub``, printed, and returned.
    """
    # The first 8 hex digits of a UUID4 make the name unique on every call.
    suffix = uuid.uuid4().hex[:8]
    repo_id = f"{username}/{model_name_prefix}-{suffix}"

    model.push_to_hub(repo_id)
    print(f"Model pushed to Hugging Face Hub: {repo_id}")

    return repo_id



In [None]:
# Run-level settings for this pipeline. The same four constants are assigned
# again in the cell that reloads the saved datasets.
OUTPUT_DIR="puppy_finetuned_embeddings"
HF_USERNAME="JTh34"
BASE_MODEL="Snowflake/snowflake-arctic-embed-l"
PDF_PATH = "data/BD_PuppiesForDummies.pdf"
# 1. Load documents
# PDF_PATH may be a single PDF or a directory of PDFs (see load_documents).
print("Loading documents...")
raw_documents = load_documents(PDF_PATH)
print(f"Loaded {len(raw_documents)} pages of raw documents")

# 2. Prepare documents
# Splits the raw documents into chunks and stamps each chunk with metadata["id"].
print("Preparing documents...")
chunked_documents = prepare_documents(raw_documents)
print(f"Created {len(chunked_documents)} document chunks")

# 3. Generate questions
# Top-level await: create_questions is a coroutine that calls the chat model
# once per chunk. `questions` maps question id -> text and `relevant_docs`
# maps question id -> [chunk id].
print("Generating questions...")
questions, relevant_docs = await create_questions(chunked_documents)
print(f"Generated {len(questions)} questions")

# 4. Split dataset
# Uses at most TOTAL_EXAMPLES questions, split in order into train / val / test.
print("Splitting dataset...")
train_dataset, val_dataset, test_dataset = split_dataset(
    chunked_documents, questions, relevant_docs, TOTAL_EXAMPLES)

In [None]:

# Persist the three splits as JSON files inside OUTPUT_DIR (created if missing)
os.makedirs(OUTPUT_DIR, exist_ok=True)
for split_name, split_data in (
    ("train", train_dataset),
    ("val", val_dataset),
    ("test", test_dataset),
):
    # Produces train_dataset.json, val_dataset.json and test_dataset.json.
    with open(os.path.join(OUTPUT_DIR, f"{split_name}_dataset.json"), "w") as f:
        json.dump(split_data, f)
print(f"Datasets saved in {OUTPUT_DIR}")

In [5]:
# Run-level settings, repeated here from the dataset-building cell.
OUTPUT_DIR = "puppy_finetuned_embeddings"
HF_USERNAME = "JTh34"
BASE_MODEL = "Snowflake/snowflake-arctic-embed-l"
PDF_PATH = "data/BD_PuppiesForDummies.pdf"

# Reload the datasets previously written to OUTPUT_DIR
_reloaded = {}
for _split in ("train", "val", "test"):
    # Reads train_dataset.json, val_dataset.json and test_dataset.json.
    with open(os.path.join(OUTPUT_DIR, f"{_split}_dataset.json"), "r") as f:
        _reloaded[_split] = json.load(f)

train_dataset = _reloaded["train"]
val_dataset = _reloaded["val"]
test_dataset = _reloaded["test"]

In [7]:
# 5. Fine-tune the model
# Trains BASE_MODEL on train_dataset and evaluates on val_dataset during
# training; the returned model is kept in `model` for the push cell below.
print("Fine-tuning the model...")
model = fine_tune_embedding_model(train_dataset, val_dataset, BASE_MODEL)


2025-05-12 14:41:34,103 - INFO - Use pytorch device_name: mps
2025-05-12 14:41:34,106 - INFO - Load pretrained SentenceTransformer: Snowflake/snowflake-arctic-embed-l


Fine-tuning the model...


2025-05-12 14:41:39,894 - INFO - 1 prompts are loaded, with the keys: ['query']


Model loaded and transferred to mps


                                                                     

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
50,No log,No log,0.545455,0.818182,0.909091,1.0,0.545455,0.272727,0.181818,0.1,0.545455,0.818182,0.909091,1.0,0.775166,0.702273,0.702273
53,No log,No log,0.545455,0.818182,0.909091,1.0,0.545455,0.272727,0.181818,0.1,0.545455,0.818182,0.909091,1.0,0.775166,0.702273,0.702273
100,No log,No log,0.590909,0.772727,0.909091,1.0,0.590909,0.257576,0.181818,0.1,0.590909,0.772727,0.909091,1.0,0.780848,0.711364,0.711364
106,No log,No log,0.681818,0.772727,0.909091,1.0,0.681818,0.257576,0.181818,0.1,0.681818,0.772727,0.909091,1.0,0.819311,0.763312,0.763312
150,No log,No log,0.681818,0.863636,0.954545,1.0,0.681818,0.287879,0.190909,0.1,0.681818,0.863636,0.954545,1.0,0.843933,0.793182,0.793182
159,No log,No log,0.681818,0.863636,0.954545,1.0,0.681818,0.287879,0.190909,0.1,0.681818,0.863636,0.954545,1.0,0.843933,0.793182,0.793182
200,No log,No log,0.727273,0.863636,0.954545,1.0,0.727273,0.287879,0.190909,0.1,0.727273,0.863636,0.954545,1.0,0.860709,0.815909,0.815909
212,No log,No log,0.727273,0.863636,0.954545,1.0,0.727273,0.287879,0.190909,0.1,0.727273,0.863636,0.954545,1.0,0.86666,0.823485,0.823485
250,No log,No log,0.727273,0.863636,0.954545,1.0,0.727273,0.287879,0.190909,0.1,0.727273,0.863636,0.954545,1.0,0.86666,0.823485,0.823485
265,No log,No log,0.727273,0.863636,0.954545,1.0,0.727273,0.287879,0.190909,0.1,0.727273,0.863636,0.954545,1.0,0.86666,0.823485,0.823485


2025-05-12 14:43:00,378 - INFO - Information Retrieval Evaluation of the model on the  dataset in epoch 0.9433962264150944 after 50 steps:
2025-05-12 14:43:04,976 - INFO - Queries: 22
2025-05-12 14:43:04,979 - INFO - Corpus: 12

2025-05-12 14:43:04,986 - INFO - Score-Function: cosine
2025-05-12 14:43:04,986 - INFO - Accuracy@1: 54.55%
2025-05-12 14:43:04,986 - INFO - Accuracy@3: 81.82%
2025-05-12 14:43:04,987 - INFO - Accuracy@5: 90.91%
2025-05-12 14:43:04,987 - INFO - Accuracy@10: 100.00%
2025-05-12 14:43:04,987 - INFO - Precision@1: 54.55%
2025-05-12 14:43:04,987 - INFO - Precision@3: 27.27%
2025-05-12 14:43:04,987 - INFO - Precision@5: 18.18%
2025-05-12 14:43:04,987 - INFO - Precision@10: 10.00%
2025-05-12 14:43:04,988 - INFO - Recall@1: 54.55%
2025-05-12 14:43:04,988 - INFO - Recall@3: 81.82%
2025-05-12 14:43:04,988 - INFO - Recall@5: 90.91%
2025-05-12 14:43:04,988 - INFO - Recall@10: 100.00%
2025-05-12 14:43:04,988 - INFO - MRR@10: 0.7023
2025-05-12 14:43:04,988 - INFO - NDCG@10: 

In [8]:
# 6. Push to Hugging Face (optional)
# Needs `hf_token` from the configuration cell and `model` from the
# fine-tuning cell.

print("Pushing model to Hugging Face...")

# Authenticate with the Hub before uploading.
login(token=hf_token)
    
# push_to_huggingface appends a random suffix, so each run of this cell
# produces a new repository name.
model_name = push_to_huggingface(model, HF_USERNAME)
print(f"Use this model name in your embedding_models.py: {model_name}")

Pushing model to Hugging Face...


2025-05-12 14:53:10,841 - INFO - Save model to /var/folders/8h/kl800c1j6hjc9xt9lm_1bhph0000gn/T/tmpkkhpgs1t
model.safetensors: 100%|██████████| 1.34G/1.34G [05:45<00:00, 3.87MB/s] 


Model pushed to Hugging Face Hub: JTh34/puppy-embed-8985966a
Use this model name in your embedding_models.py: JTh34/puppy-embed-8985966a
