In [24]:
# Download the raw text of "Bouvard et Pécuchet" from GitHub into the current working directory (-q: quiet).
!wget https://raw.githubusercontent.com/GwenTsang/Flaubert_FlauBERT/main/romans_Flaubert/Bouvard_et_Pecuchet.txt -q

In [2]:
#@title Méthode sophistiquée pour l'extraction des chapitres

import re
from collections import OrderedDict
from typing import List, Optional, Dict, Iterable
import polars as pl

# Whole-string Roman numeral (I..MMMCMXCIX), case-insensitive, optionally
# followed by a single trailing dot.
# Every group in the body can match the empty string, so the lookahead must
# demand an actual numeral letter: a weaker `(?=.)` guard accepts a line that
# consists of "." alone and yields a heading with an empty key.
ROMAN_RE = re.compile(
    r'^(?=[MDCLXVI])M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\.?$',
    re.IGNORECASE
)
# A standalone word made only of Roman-numeral letters (group 1), optionally
# followed by a dot; validity of the numeral is checked separately with ROMAN_RE.
ROMAN_TOKEN_RE = re.compile(r'\b([IVXLCDM]+)\b\.?', re.IGNORECASE)
# A line starting with CHAPITRE / CHAP / CHAPTER (optional dot); group 2 is the
# remainder of the line after the keyword.
CHAP_RE = re.compile(r'^\s*(CHAPITRE|CHAP|CHAPTER)\b\.?\s*(.*)$', re.IGNORECASE)


def _norm_roman(tok: str) -> str:
    return tok.rstrip('.').upper()


def scan_headings_polars(path: str, prefer_chapitre: bool = True, allow_roman: bool = True, one_based: bool = True) -> pl.DataFrame:
    """
    Scan a text file line by line and collect candidate chapter headings.

    A heading is either a line matched by CHAP_RE (only when `prefer_chapitre`
    is true) or a line that is a bare Roman numeral (only when `allow_roman`
    is true). For a CHAP_RE line the numeral is taken from the rest of that
    line, or failing that from the next non-blank line when it is a Roman
    numeral and `allow_roman` is true.

    Returns a pl.DataFrame with two columns:
      - 'element'      : heading label, e.g. "I" or "CHAPITRE I" (or the
                         stripped heading line when no numeral was found)
      - 'line_indices' : list of line indices where that label occurred
                         (1-based when `one_based` is true, else 0-based)
    Rows keep the order in which each label was first seen.
    """
    with open(path, encoding='utf-8') as handle:
        lines = handle.read().splitlines()

    total = len(lines)
    shift = 1 if one_based else 0
    claimed = set()           # 0-based indices already attributed to a heading
    headings = OrderedDict()  # label -> list of (shifted) line indices

    def record(label: str, positions: List[int]):
        headings.setdefault(label, []).extend(p + shift for p in positions)

    for pos, raw in enumerate(lines):
        text = raw.strip()
        if not text:
            continue

        chap = CHAP_RE.match(text) if prefer_chapitre else None
        if chap:
            numeral = None
            positions = [pos]
            tail = chap.group(2).strip()
            if tail:
                token = ROMAN_TOKEN_RE.search(tail)
                if token and ROMAN_RE.fullmatch(token.group(1)):
                    numeral = _norm_roman(token.group(1))
            if not numeral:
                # Look at the first non-blank line after the heading keyword.
                nxt = next((k for k in range(pos + 1, total) if lines[k].strip()), total)
                if nxt < total and allow_roman and ROMAN_RE.match(lines[nxt].strip()):
                    numeral = _norm_roman(lines[nxt].strip())
                    positions.append(nxt)
                    # Prevent the numeral line from being recorded again on its own.
                    claimed.add(nxt)
            record(f"CHAPITRE {numeral}" if numeral else text, positions)
            claimed.add(pos)
            continue

        if allow_roman and pos not in claimed and ROMAN_RE.match(text):
            record(_norm_roman(text), [pos])
            claimed.add(pos)

    return pl.DataFrame({
        "element": list(headings.keys()),
        "line_indices": list(headings.values()),
    })

def _roman_to_int(s: str) -> int:
    """
    Convert a Roman numeral string to its integer value.

    The input is normalized with `_norm_roman` first. Symbols are read right
    to left: a symbol smaller than the largest one seen so far is subtracted,
    any other symbol is added. A character outside IVXLCDM raises KeyError.
    """
    worth_of = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    numeral = _norm_roman(s)
    result = 0
    largest_seen = 0
    for symbol in numeral[::-1]:
        worth = worth_of[symbol]
        if worth >= largest_seen:
            result += worth
            largest_seen = worth
        else:
            result -= worth
    return result

def _int_to_roman(n: int) -> str:
    numerals = [
        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
    ]
    res = []
    for val, sym in numerals:
        while n >= val:
            res.append(sym)
            n -= val
    return ''.join(res)

def _extract_roman_from_element(element: str) -> Optional[str]:
    """
    Pull the normalized Roman numeral out of a heading label.

    If the whole (stripped) label is a Roman numeral it is returned normalized.
    Otherwise the first standalone Roman-letter word in the label is used,
    provided it is a valid numeral (e.g. 'CHAPITRE XII' -> 'XII').
    Returns None when no valid numeral is found.
    """
    label = element.strip()
    if ROMAN_RE.fullmatch(label):
        return _norm_roman(label)

    token = ROMAN_TOKEN_RE.search(label)
    if token is None:
        return None

    candidate = _norm_roman(token.group(1))
    return candidate if ROMAN_RE.fullmatch(candidate) else None

def _build_candidates_by_roman(df: pl.DataFrame) -> Dict[str, List[int]]:
    """
    Group heading line indices by Roman numeral.

    Each row of `df` is unpacked as (element, line_indices). Rows whose element
    carries no valid numeral, or whose indices are not a non-empty list, are
    ignored. The result maps each numeral to its sorted, de-duplicated indices.
    """
    grouped: Dict[str, List[int]] = {}
    for element, idxs in df.iter_rows():
        roman = _extract_roman_from_element(element)
        if roman and isinstance(idxs, list) and idxs:
            grouped.setdefault(roman, []).extend(int(x) for x in idxs)
    return {roman: sorted(set(found)) for roman, found in grouped.items()}

_LETTER_TOKEN_RE = re.compile(r"[^\W\d_]+", flags=re.UNICODE)

def _has_all_caps_word(line: str, min_len: int = 2) -> bool:
    """
    True if the line contains a word of only letters, length >= min_len, and all letters uppercase.
    Examples that count: 'PREFACE', 'TABLE', 'III' (roman in caps), 'PRÉFACE'
    Examples that do NOT count: 'John', 'I' (single letter), 'Title-Case' (split as 'Title', 'Case' -> both not isupper)
    """
    for tok in _LETTER_TOKEN_RE.findall(line):
        if len(tok) >= min_len and tok.isupper():
            return True
    return False

def _invalidates_first_I(lines: List[str], i_one_based: int, window: Iterable[int] = (1, 2, 3), require_full_window: bool = True) -> bool:
    """
    Returns True if candidate index i for chapter I should be rejected:
      - any of lines i+1, i+2, i+3 has an ALL-CAPS word (length >= 2), OR
      - require_full_window=True and any of those lines do not exist.
    """
    n = len(lines)
    i0 = i_one_based - 1
    if require_full_window and any(i0 + off >= n for off in window):
        return True
    for off in window:
        j = i0 + off
        if 0 <= j < n:
            if _has_all_caps_word(lines[j].lstrip(), min_len=2):
                return True
    return False

def select_monotonic_chapters(
    df: pl.DataFrame,
    target_max: int = 26,
    require_complete: bool = False,
    keep_candidates: bool = False,
    min_lines: int = 8,
    enforce_last_min: bool = False,
    total_lines: Optional[int] = None,
    lines: Optional[List[str]] = None,
    require_full_window_for_I: bool = True,
) -> pl.DataFrame:
    """
    Pick one line index per chapter (I..target_max) so that:
      - indices strictly increase
      - each chapter start is at least `min_lines` after the previous start
      - special rule for chapter I (only when `lines` is given):
          reject a candidate if one of the lookahead lines after it has an
          ALL-CAPS word (len >= 2), or (when `require_full_window_for_I`)
          if those lookahead lines are missing.

    For each chapter the first candidate satisfying the constraints is taken.

    Args:
        df: heading scan result; each row is (element, line_indices).
        target_max: highest chapter number to look for.
        require_complete: raise ValueError instead of skipping a chapter that
            cannot be placed (or a trailing chapter that is too short).
        keep_candidates: add a 'candidates' column with the de-duplicated
            candidate indices of each selected chapter.
        min_lines: minimum distance, in lines, between consecutive starts.
        enforce_last_min: together with `total_lines`, drop trailing chapters
            that have fewer than `min_lines` lines before end of file.
        total_lines: number of lines in the file (for `enforce_last_min`).
        lines: raw file lines; enables the chapter I rule.
        require_full_window_for_I: forwarded to the chapter I rule.

    Returns:
        A pl.DataFrame with columns 'element' (Roman numeral) and 'line'
        (plus 'candidates' when requested). When no chapter is selected an
        empty frame with those columns is returned.

    Raises:
        ValueError: only when `require_complete` is true, see above.
    """
    by_roman = _build_candidates_by_roman(df)
    order = [_int_to_roman(i) for i in range(1, target_max + 1)]
    out_rows = []
    prev = 0  # line of the previously selected chapter; 0 means none yet

    for r in order:
        candidates = by_roman.get(r, [])
        threshold = 1 if prev == 0 else prev + min_lines

        chosen = None
        if r == "I" and lines is not None:
            for x in candidates:
                if x >= threshold and not _invalidates_first_I(lines, x, require_full_window=require_full_window_for_I):
                    chosen = x
                    break
        else:
            chosen = next((x for x in candidates if x >= threshold), None)

        if chosen is None:
            if require_complete:
                rule = " with I-rule" if (r == "I" and lines is not None) else ""
                raise ValueError(f"No feasible index for chapter {r}{rule}. Threshold={threshold}. Candidates={candidates}")
            continue

        row = {"element": r, "line": chosen}
        if keep_candidates:
            row["candidates"] = candidates
        out_rows.append(row)
        prev = chosen

    if enforce_last_min and total_lines is not None:
        # Drop (or reject) trailing chapters that are too close to end of file.
        while out_rows:
            last_line = out_rows[-1]["line"]
            if (total_lines - last_line + 1) >= min_lines:
                break
            if require_complete:
                raise ValueError(
                    f"Last chapter {out_rows[-1]['element']} at line {last_line} "
                    f"has fewer than {min_lines} lines until EOF ({total_lines})."
                )
            out_rows.pop()

    if not out_rows:
        # pl.from_dicts cannot infer a schema from an empty list and raises,
        # so build the empty result with an explicit schema; callers rely on
        # being able to test `.height == 0`.
        empty_schema = {"element": pl.Utf8, "line": pl.Int64}
        if keep_candidates:
            empty_schema["candidates"] = pl.List(pl.Int64)
        return pl.DataFrame(schema=empty_schema)

    return pl.from_dicts(out_rows)

In [3]:
#@title Application des fonctions et constitution d'un dictionnaire

from glob import glob
import os
import polars as pl


# Directory whose top-level *.txt files are scanned for chapter headings below.
folder = "/content"


def load_chapters(path, start_lines):
    """
    Cut a text file into chapters.

    `start_lines` maps a chapter label to the 1-based line where that chapter
    begins. Each chapter runs from its start line up to (not including) the
    next chapter's start line; the last one runs to the end of the file.
    Trailing newlines are stripped from each chapter's text.

    Returns a dict {label: chapter text}, ordered by start line.
    Raises ValueError if a start line falls outside the file.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        content = handle.readlines()
    n_lines = len(content)

    # Validate and switch to 0-based offsets.
    offsets = []
    for roman, lineno in start_lines.items():
        if not 1 <= lineno <= n_lines:
            raise ValueError(f"Invalid start index for chapter {roman}: line {lineno} (file has {n_lines} lines)")
        offsets.append((roman, lineno - 1))
    offsets.sort(key=lambda pair: pair[1])

    # Each chapter stops where the following one begins.
    stops = [begin for _, begin in offsets[1:]] + [n_lines]
    return {
        label: ''.join(content[begin:stop]).rstrip('\n')
        for (label, begin), stop in zip(offsets, stops)
    }


def build_starts_index(folder: str, target_max: int = 26, min_lines: int = 8):
    """
    Detect chapter starts in every `*.txt` file directly under `folder`.

    Returns a pair:
      - a DataFrame with columns file / path / chapter / start_line, sorted by
        file then start line (empty columns when nothing was found);
      - a dict {path: {chapter: start_line}}.
    Files for which no chapter is selected are left out of both.
    """
    records = []
    per_file = {}

    for txt_path in sorted(glob(f"{folder}/*.txt")):
        with open(txt_path, encoding="utf-8") as handle:
            raw_lines = handle.read().splitlines()

        picked = select_monotonic_chapters(
            scan_headings_polars(txt_path),
            target_max=target_max,
            min_lines=min_lines,
            lines=raw_lines,
        )
        if picked.height == 0:
            continue

        base_name = os.path.basename(txt_path)
        for chapter, line_no in picked.select(["element", "line"]).iter_rows():
            line_no = int(line_no)
            records.append({
                "file": base_name,
                "path": txt_path,
                "chapter": chapter,
                "start_line": line_no,
            })
            per_file.setdefault(txt_path, {})[chapter] = line_no

    if records:
        index_df = pl.from_dicts(records).sort(["file", "start_line"])
    else:
        index_df = pl.DataFrame({"file": [], "path": [], "chapter": [], "start_line": []})
    return index_df, per_file


# Build chapter start index
# starts_df: one row per detected chapter start; starts_map: {path: {chapter: start_line}}.
starts_df, starts_map = build_starts_index(folder, target_max=26, min_lines=8)
print("Chapter start index DataFrame:")
print(starts_df)

# Load chapters for each book
# book_chapters: {file name: {chapter label: chapter text}}.
book_chapters = {}
for path, start_lines in starts_map.items():
    fname = os.path.basename(path)
    chapters = load_chapters(path, start_lines)
    book_chapters[fname] = chapters

Chapter start index DataFrame:
shape: (10, 4)
┌─────────────────────────┬─────────────────────────────────┬─────────┬────────────┐
│ file                    ┆ path                            ┆ chapter ┆ start_line │
│ ---                     ┆ ---                             ┆ ---     ┆ ---        │
│ str                     ┆ str                             ┆ str     ┆ i64        │
╞═════════════════════════╪═════════════════════════════════╪═════════╪════════════╡
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ I       ┆ 19         │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ II      ┆ 331        │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ III     ┆ 939        │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ IV      ┆ 1865       │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ V       ┆ 2643       │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ VI      ┆ 3211       │
│ Bouvard_et_Pecuch

In [4]:
# Export a fixed range of lines from the novel to a separate text file.
input_file = "/content/Bouvard_et_Pecuchet.txt"
# NOTE(review): this output is a .txt file inside /content, the folder that
# build_starts_index globs; a later re-run of that cell will scan it as well.
output_file = "/content/Bouvard_et_Pecuchet_chapitre_1.txt"

# 1-based, inclusive bounds of the exported range.
# NOTE(review): the index printed above reports chapter I at line 19 and
# chapter II at line 331; these hand-picked bounds presumably skip the heading
# lines — confirm.
start_line = 22
end_line = 330

# Open the input file and extract the required lines
with open(input_file, "r", encoding="utf-8") as infile:
    lines = infile.readlines()

# Select lines start_line..end_line inclusive (1-based); the slice subtracts 1
# from the start because Python lists are 0-based and slice ends are exclusive.
selected_lines = lines[start_line - 1:end_line]

# Write the selected lines to a new file
with open(output_file, "w", encoding="utf-8") as outfile:
    outfile.writelines(selected_lines)

print(f"Lines {start_line} to {end_line} have been saved to {output_file}")

Lines 22 to 330 have been saved to /content/Bouvard_et_Pecuchet_chapitre_1.txt


In [5]:
import re

# Rough sentence count for the exported chapter, using a purely
# punctuation-based split (no handling of abbreviations or decimals).
file_path = "/content/Bouvard_et_Pecuchet_chapitre_1.txt"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

# Split on one or more of '.', '?', '!' followed by a whitespace character
# (which includes newlines) or by the end of the text. The terminating
# punctuation is consumed by the split and is not kept in the pieces.
sentences = re.split(r'[.!?]+(?:\s|$)', text)

# Drop empty pieces and surrounding whitespace.
sentences = [s.strip() for s in sentences if s.strip()]

num_sentences = len(sentences)

print(f"Number of sentences in the file: {num_sentences}")

Number of sentences in the file: 389


In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the CamemBERT-large tokenizer and the checkpoint with its masked-LM head.
# NOTE(review): the cells below rebind `tokenizer` and `model` (using AutoModel
# on the same checkpoint); this masked-LM instance does not appear to be used
# by any visible cell.
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-large")
model = AutoModelForMaskedLM.from_pretrained("almanach/camembert-large")

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/809k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/374 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

In [7]:
pip install nltk -q

In [8]:
import nltk
# Fetch the Punkt sentence-tokenizer resources used for sentence segmentation
# further down ("punkt_tab" is the resource name needed by newer NLTK releases).
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
#@title Segmentation en phrases du texte de Beauvoir + plongements (embeddings) CamemBERT

import os
import re
import pickle
from typing import List, Dict

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# --- Paths and constants ---
# Text whose sentences are segmented and embedded by this cell.
FILE_PATH = "/content/Simone de Beauvoir.txt"
# NOTE(review): the pickle name says "flaubert" while FILE_PATH points to the
# Beauvoir text — confirm which is intended.
DICT_PATH = "/content/flaubert_sentence_embeddings.pkl"  # sentence -> np.ndarray (float32)
# NOTE(review): this .txt lives in /content, the folder globbed by
# build_starts_index, so it will be scanned for chapters on a re-run.
SENTENCES_TXT = "/content/sentences_extracted.txt"       # optional: inspect extracted sentences
# Only used to print a warning when the extracted count differs.
# NOTE(review): the recorded output reports 1095 sentences, so this value looks stale.
EXPECTED_SENT_COUNT = 250
BATCH_SIZE = 32  # sentences encoded per forward pass
MODEL_NAME = "almanach/camembert-large"

# --- Load tokenizer and model ---
# Bare encoder (AutoModel): its last hidden state is mean-pooled below.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()  # inference mode (disables dropout)

# --- Sentence segmentation helpers ---
def download_nltk_punkt():
    """
    Make sure the NLTK Punkt resources are present, downloading them if needed.

    Both "punkt" and "punkt_tab" are checked (the latter is required by newer
    NLTK releases, >= 3.9). Returns True when the checks ran without raising,
    False (after printing the reason) when NLTK is unavailable or an
    unexpected error occurred.
    """
    try:
        import nltk
        for resource in ("punkt", "punkt_tab"):
            try:
                # Already installed? Then nothing to do for this resource.
                nltk.data.find(f"tokenizers/{resource}")
            except LookupError:
                nltk.download(resource, quiet=True)
    except Exception as e:
        print(f"NLTK download failed or unavailable: {e}")
        return False
    return True

def rule_based_split(text: str) -> List[str]:
    """
    Split French text into sentences with simple punctuation rules.

    Whitespace is collapsed to single spaces and '…' becomes '...'. Dots that
    belong to common abbreviations, to decimal numbers, or to an ellipsis are
    shielded with placeholders so they never end a sentence. The text is then
    cut at whitespace that follows '.', '!' or '?', keeping the punctuation
    with its sentence, and the placeholders are turned back into dots.

    Returns the non-empty, stripped sentences in order.
    """
    abbreviations = re.compile(
        r"\b(?:M|MM|Mme|Mlle|Dr|Pr|Prof|St|Ste|Sr|Sra|cf|etc|env|ex|vol|éd|No|Nº|nº|n°|p|pp|al|fig|art|av|apr|vs)\.",
        flags=re.IGNORECASE,
    )
    # (placeholder, original text), restored in this order.
    placeholders = (("<ELLIPSIS>", "..."), ("<DEC>", "."), ("<DOT>", "."))

    # Normalize: unify ellipsis, collapse whitespace.
    flat = re.sub(r"\s+", " ", text.replace("…", "...")).strip()

    # Shield abbreviation dots, decimal points (12.5 -> 12<DEC>5), ellipses.
    shielded = abbreviations.sub(lambda hit: hit.group(0)[:-1] + "<DOT>", flat)
    shielded = re.sub(r"(?<=\d)\.(?=\d)", "<DEC>", shielded)
    shielded = shielded.replace("...", "<ELLIPSIS>")

    result = []
    for chunk in re.split(r"(?<!<ELLIPSIS>)(?<=[.!?])\s+", shielded):
        for marker, literal in placeholders:
            chunk = chunk.replace(marker, literal)
        chunk = chunk.strip()
        if chunk:
            result.append(chunk)
    return result

def extract_sentences(path: str) -> List[str]:
    """
    Read a text file and split it into sentences.

    NLTK's French `sent_tokenize` is used when the Punkt resources are
    available and the call succeeds; otherwise `rule_based_split` is used.
    Side effects: prints the sentence count (plus a warning if it differs from
    EXPECTED_SENT_COUNT and a note if duplicates exist) and writes the
    sentences, one `write` per sentence followed by a newline, to SENTENCES_TXT.

    Returns the list of stripped, non-empty sentences.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    # Preferred path: NLTK.
    sentences = None
    if download_nltk_punkt():
        try:
            from nltk.tokenize import sent_tokenize
            stripped = (piece.strip() for piece in sent_tokenize(text, language="french"))
            sentences = [piece for piece in stripped if piece]
        except Exception as e:
            print(f"NLTK sent_tokenize failed, falling back to rule-based: {e}")

    # Fallback path: regex rules.
    if sentences is None:
        sentences = rule_based_split(text)

    total = len(sentences)
    print(f"Extracted {total} sentences.")
    if total != EXPECTED_SENT_COUNT:
        print(f"Warning: expected {EXPECTED_SENT_COUNT} sentences, got {total}. Proceeding anyway.")

    # Dump the sentences for manual inspection.
    with open(SENTENCES_TXT, "w", encoding="utf-8") as out:
        out.writelines(s + "\n" for s in sentences)

    # Sentences become dictionary keys later on, so repeated ones collapse.
    duplicates = total - len(set(sentences))
    if duplicates:
        print(f"Note: {duplicates} duplicate sentence(s) detected. "
              f"Since the dictionary uses sentences as keys, duplicates will be merged.")

    return sentences

# --- Mean pooling over last hidden layer ---
def mean_pooling(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Average the token vectors of each sequence, ignoring masked-out positions.

    `attention_mask` is expanded along the hidden dimension and used as a
    0/1 weight; the divisor is clamped to 1e-9 so an all-zero mask yields
    zeros instead of a division by zero. Output shape: [batch, hidden].
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    token_totals = torch.sum(last_hidden_state * weights, dim=1)
    token_counts = torch.clamp(weights.sum(dim=1), min=1e-9)
    return token_totals / token_counts

# --- Embed sentences in batches ---
def embed_sentences(sentences: List[str]) -> Dict[str, np.ndarray]:
    """
    Encode sentences in batches of BATCH_SIZE and mean-pool the model's last
    hidden state into one float32 vector per sentence.

    Inputs are padded per batch and truncated to 512 tokens. Progress is
    printed every four batches and at the end. Returns {sentence: vector};
    identical sentences share one entry (the last one encoded wins).
    """
    table: Dict[str, np.ndarray] = {}
    total = len(sentences)
    with torch.no_grad():
        for offset in range(0, total, BATCH_SIZE):
            chunk = sentences[offset:offset + BATCH_SIZE]
            encoded = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(device)

            hidden = model(**encoded).last_hidden_state                  # [B, T, H]
            pooled = mean_pooling(hidden, encoded["attention_mask"])     # [B, H]
            vectors = pooled.detach().cpu().numpy().astype(np.float32)

            table.update(zip(chunk, vectors))

            processed = min(offset + BATCH_SIZE, total)
            if processed == total or processed % (BATCH_SIZE * 4) == 0:
                print(f"Encoded {processed}/{total} sentences")
    return table

# --- Run all steps ---
# 1) segment FILE_PATH into sentences, 2) embed them, 3) pickle the mapping.
sentences = extract_sentences(FILE_PATH)
emb_dict = embed_sentences(sentences)

# Persist {sentence: embedding} so the similarity cells can reload it.
with open(DICT_PATH, "wb") as f:
    pickle.dump(emb_dict, f)

# len(emb_dict) can be smaller than len(sentences): duplicate sentences share a key.
print(f"Saved {len(emb_dict)} sentence embeddings to {DICT_PATH}")
print("Done.")

Some weights of CamembertModel were not initialized from the model checkpoint at almanach/camembert-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracted 1095 sentences.
Note: 7 duplicate sentence(s) detected. Since the dictionary uses sentences as keys, duplicates will be merged.
Encoded 128/1095 sentences
Encoded 256/1095 sentences
Encoded 384/1095 sentences
Encoded 512/1095 sentences
Encoded 640/1095 sentences
Encoded 768/1095 sentences
Encoded 896/1095 sentences
Encoded 1024/1095 sentences
Encoded 1095/1095 sentences
Saved 1088 sentence embeddings to /content/flaubert_sentence_embeddings.pkl
Done.


In [7]:
#@title À partir d'une phrase, renvoyer la phrase la plus similaire dans l'ouvrage de Beauvoir

import pickle
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Same pickle path and checkpoint as the embedding cell; repeated here so this
# cell can run on its own.
DICT_PATH = "/content/flaubert_sentence_embeddings.pkl"
MODEL_NAME = "almanach/camembert-large"

# Load tokenizer/model (same checkpoint)
# Queries must be embedded with the same checkpoint as the stored sentences
# for the cosine similarities to be meaningful.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

# Mean pooling helper (same as in Part 1)
def mean_pooling(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Mask-aware mean over the token axis (same computation as in the embedding
    cell): masked-out positions contribute nothing, and the token count is
    clamped to 1e-9 to avoid dividing by zero.
    """
    keep = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    summed = (last_hidden_state * keep).sum(dim=1)
    counts = keep.sum(dim=1).clamp(min=1e-9)
    pooled = summed / counts
    return pooled

# Load the saved dictionary: sentence -> embedding (np.ndarray)
# SECURITY: pickle.load can execute arbitrary code; only load a file that this
# notebook produced itself.
with open(DICT_PATH, "rb") as f:
    sentence_to_emb = pickle.load(f)

# Build arrays for vectorized similarity
# `sentences` and the rows of `emb_matrix` share the same order.
sentences = list(sentence_to_emb.keys())
emb_matrix = np.stack([sentence_to_emb[s] for s in sentences], axis=0)  # [N, H]
# Pre-normalize for cosine similarity
emb_norms = np.linalg.norm(emb_matrix, axis=1, keepdims=True)
emb_norms[emb_norms == 0] = 1e-12  # avoid division by zero for all-zero rows
emb_matrix_normed = emb_matrix / emb_norms  # [N, H]

def embed_sentence(sentence: str) -> np.ndarray:
    """
    Embed one sentence: tokenize (truncated to 512 tokens), run the model
    without gradients, mean-pool the last hidden state and return the result
    as a 1-D float32 NumPy array.
    """
    with torch.no_grad():
        encoded = tokenizer(
            [sentence],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(device)
        hidden = model(**encoded).last_hidden_state
        pooled = mean_pooling(hidden, encoded["attention_mask"])  # [1, H]
    vector = pooled.squeeze(0).detach().cpu().numpy()
    return vector.astype(np.float32)

def most_similar_sentence(query: str) -> str:
    """
    Return the stored sentence whose embedding has the highest cosine
    similarity with the embedding of `query`.
    """
    vec = embed_sentence(query)
    length = np.linalg.norm(vec)
    # Guard against a zero vector before normalizing.
    unit = vec / (length if length != 0 else 1e-12)
    scores = emb_matrix_normed @ unit  # cosine similarity against every row
    return sentences[int(np.argmax(scores))]

Some weights of CamembertModel were not initialized from the model checkpoint at almanach/camembert-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
#@title à partir d'une phrase, renvoyer les k phrases les plus similaires dans le txt de Beauvoir

from typing import List, Tuple, Union

def top_k_similar_sentences(query: str, k: int = 5, return_scores: bool = True, print_results: bool = False) -> Union[List[Tuple[str, float]], List[str]]:
    """
    Find the `k` stored sentences most similar (cosine) to `query`.

    - return_scores=True  -> list of (sentence, score), best first.
    - return_scores=False -> list of sentences only, best first.
    - print_results=True  -> additionally print "score<TAB>sentence" per hit.
    A non-positive `k` gives an empty list; `k` is capped at the number of
    stored sentences.
    """
    vec = embed_sentence(query)  # shape (H,)
    norm = np.linalg.norm(vec)
    if norm == 0:
        norm = 1e-12
    vec = vec / norm

    # Rows of emb_matrix_normed are unit vectors, so the product is the cosine.
    sims = emb_matrix_normed @ vec  # shape (N,)

    if k <= 0:
        return []
    k = min(k, sims.shape[0])

    if k == 1:
        ranked = [int(np.argmax(sims))]
    else:
        # Partial selection of the k best, then order just those k.
        shortlist = np.argpartition(-sims, k-1)[:k]
        ranked = shortlist[np.argsort(-sims[shortlist])]
    result = [(sentences[i], float(sims[i])) for i in ranked]

    if print_results:
        for sent, score in result:
            print(f"{score:.6f}\t{sent}")

    if not return_scores:
        return [sent for sent, _ in result]
    return result

In [9]:
# Example query: print the five stored sentences closest to the given phrase.
top5 = top_k_similar_sentences("L'existence précède l'essence", k=5, return_scores=True, print_results=True)

0.881214	Il ne saurait s’identifier avec l’infini.
0.877296	L’humanité est une suite discontinue d’hommes libres qu’isole irrémédiablement leur subjectivité.
0.868910	Il n’existe que l’impression du moment.
0.866662	La plénitude de l’être, c’est l’éternité ; cet objet qui s’écroulera un jour n’est pas vraiment.
0.866049	Les hésitations de Heidegger touchant le degré de réalité de l’existence inauthentique ont leur source dans ce sophisme.


In [11]:
# Second example query; `top5` is overwritten with the new results.
top5 = top_k_similar_sentences("Le plaisir et la jouissance sont à rechercher", k=5, return_scores=True, print_results=True)

0.912508	Mais la jouissance est-elle repos ?
0.910454	Il pose l’objectivité des valeurs admises telles que la santé, la richesse, la gloire.
0.907891	La réflexion ne saurait arrêter l’élan de notre spontanéité.
0.907553	Il est des blâmes et des haines que j’assume avec joie : le révolutionnaire qui combat le projet du conservateur souhaite lui apparaître comme une force hostile.
0.907323	Et il ne gagne rien à cette métamorphose.
