## Ingesting Book....

In [2]:
from pathlib import Path
from bs4 import BeautifulSoup
from langchain.schema import Document
import re
import nltk

# Télécharger le tokenizer de phrase si nécessaire
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noeay\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [34]:


class HTMLLoader:
    """
    Loader HTML avancé : 
    - extrait les phrases des <p>
    - gère pages implicites et ranges
    - regroupe phrases courtes
    """

    # numéros entre parenthèses 1 à 3 chiffres (exclut les années)
    PAGE_PATTERN = re.compile(r'\((\d{1,3})(?!\d)\)')

    def __init__(self, folder_path: str, min_sent_len=50):
        self.folder = Path(folder_path)
        self.min_sent_len = min_sent_len

    def _split_sentences_with_pages(self, text: str, last_page=None):
        sentences = sent_tokenize(text, language='french')
        result = []

        # parcourir toutes les phrases et assigner la page
        for i, sentence in enumerate(sentences):
            pages_in_sentence = self.PAGE_PATTERN.findall(sentence)
            if pages_in_sentence:
                x = int(pages_in_sentence[-1]) + 1 
                page = f"{last_page}-{x}" if last_page else str(x)
                last_page = x
            else:
                page = str(last_page) if last_page else "unknown"

            # si page unknown, chercher la prochaine page dans le reste
            if page == "unknown":
                for future_sentence in sentences[i+1:]:
                    match = self.PAGE_PATTERN.search(future_sentence)
                    if match:
                        page = match.group(1)
                        break
                if page == "unknown":
                    page = str(last_page) if last_page else "unknown"

            result.append((sentence, page))

        return result, last_page

    def _group_short_sentences(self, sentences_with_pages):
        grouped = []
        buffer = []
        buffer_page = None

        for sentence, page in sentences_with_pages:
            if not buffer:
                buffer.append(sentence)
                buffer_page = page
            elif len(sentence) < self.min_sent_len and page == buffer_page:
                buffer.append(sentence)
            else:
                grouped.append((" ".join(buffer), buffer_page))
                buffer = [sentence]
                buffer_page = page
        if buffer:
            grouped.append((" ".join(buffer), buffer_page))
        return grouped

    def load(self):
        documents = []
        for html_file in self.folder.glob("*.html"):
            soup = BeautifulSoup(html_file.read_text(encoding="utf-8"), "html.parser")
            paragraphs = soup.find_all("p")
            last_page = None
            for p in paragraphs:
                text = p.get_text().strip()
                if not text:
                    continue
                sentences_with_pages, last_page = self._split_sentences_with_pages(text, last_page)
                grouped_sentences = self._group_short_sentences(sentences_with_pages)
                for sentence, page in grouped_sentences:
                    documents.append(Document(
                        page_content=sentence,
                        metadata={"book": html_file.name, "page": page}
                    ))
        return documents


In [35]:
loader = HTMLLoader("data")
docs = loader.load()
print(f"{len(docs)} phrases chargées et prêtes pour vectorisation.")

11319 phrases chargées et prêtes pour vectorisation.


In [36]:
docs

[Document(metadata={'book': 'la-connaissance-de-la-vie-de-georges-canguilhem.html', 'page': 'unknown'}, page_content='Georges CANGUILHEM'),
 Document(metadata={'book': 'la-connaissance-de-la-vie-de-georges-canguilhem.html', 'page': 'unknown'}, page_content='La connaissance'),
 Document(metadata={'book': 'la-connaissance-de-la-vie-de-georges-canguilhem.html', 'page': 'unknown'}, page_content='de la vie'),
 Document(metadata={'book': 'la-connaissance-de-la-vie-de-georges-canguilhem.html', 'page': 'unknown'}, page_content='Edition VRIN'),
 Document(metadata={'book': 'la-connaissance-de-la-vie-de-georges-canguilhem.html', 'page': 'unknown'}, page_content='(Les références paginaires sont données dans cette édition)'),
 Document(metadata={'book': 'la-connaissance-de-la-vie-de-georges-canguilhem.html', 'page': 'unknown'}, page_content='Résumé et recueil de citations sur le thème de «\xa0Expériences de la nature\xa0»'),
 Document(metadata={'book': 'la-connaissance-de-la-vie-de-georges-canguilh

## Vector Embeddings

In [41]:
!ollama list
# # Pull sfr-embedding-mistral model from Ollama if you don't have it
#!ollama pull sfr-embedding-mistral

NAME                                ID              SIZE      MODIFIED      
avr/sfr-embedding-mistral:latest    3a707fec6ecc    4.4 GB    4 minutes ago    
qwen3:latest                        500a1f067a9f    5.2 GB    5 weeks ago      
deepseek-r1:latest                  6995872bfe4c    5.2 GB    2 months ago     
qwen2:0.5b                          6f48b936a09f    352 MB    3 months ago     
llama3.2:3b                         a80c4f17acd5    2.0 GB    4 months ago     
llama3.2:latest                     a80c4f17acd5    2.0 GB    6 months ago     
mistral:latest                      f974a74358d6    4.1 GB    6 months ago     
qwen2.5:latest                      845dbda0ea48    4.7 GB    6 months ago     
mxbai-embed-large:latest            468836162de7    669 MB    6 months ago     
deepseek-r1:1.5b                    a42b25d8c10a    1.1 GB    6 months ago     


In [51]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma


model_name = "avr/sfr-embedding-mistral"
vector_db = Chroma(persist_directory= "./book_db",embedding_function= OllamaEmbeddings(model=model_name),collection_name="raw_book")

  vector_db = Chroma(persist_directory= "./book_db",embedding_function= OllamaEmbeddings(model=model_name),collection_name="raw_book")


In [53]:
from tqdm import tqdm

for document in tqdm(docs, desc="Calcul des embeddings"):
    try:
        vector_db.add_documents([document])
    except: 
        print("Error on : ", document)
        
vector_db.persist()

Calcul des embeddings: 100%|██████████| 11319/11319 [15:02<00:00, 12.54it/s]
  vector_db.persist()


In [58]:
import plotly.express as px
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE

# docs = loader.load()  # ta liste de Document avec metadata et page_content
# vector_db = Chroma.from_documents(...) si tu as déjà vectorisé

# Récupérer embeddings et metadata
# Récupérer toutes les vecteurs et metadata depuis la collection
results = vector_db.get(include=["metadatas", "documents", "embeddings"])

# Embeddings
embeddings = [np.array(e, dtype=float) for e in results["embeddings"]]  # liste de listes de floats

# Metadata
metadata = results["metadatas"]     # liste de dicts

# Texte
phrases = results["documents"]      # liste de chaînes


In [59]:

df = pd.DataFrame({
    "embedding": embeddings,
    "book": [m['book'] for m in metadata],
    "page": [m.get('page', "") for m in metadata],
    "phrase": phrases
})

# t-SNE sur embeddings
tsne_result = TSNE(n_components=2, random_state=0).fit_transform(np.array(df['embedding'].to_list()))

df['x'] = tsne_result[:,0]
df['y'] = tsne_result[:,1]

In [60]:


# Plot interactif
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="book",
    hover_data={"phrase": True, "page": True},
    title="Visualisation t-SNE des phrases vectorisées"
)

fig.show()


## Retrieval