In [4]:
!wget https://raw.githubusercontent.com/GwenTsang/Flaubert_FlauBERT/main/romans_Flaubert/Bouvard_et_Pecuchet.txt

--2025-10-17 09:14:20--  https://raw.githubusercontent.com/GwenTsang/Flaubert_FlauBERT/main/romans_Flaubert/Bouvard_et_Pecuchet.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 561804 (549K) [text/plain]
Saving to: ‘Bouvard_et_Pecuchet.txt.1’


2025-10-17 09:14:20 (14.1 MB/s) - ‘Bouvard_et_Pecuchet.txt.1’ saved [561804/561804]



In [7]:
#@title Méthode sophistiquée pour l'extraction des chapitres

import re
from collections import OrderedDict
from typing import List, Optional, Dict, Iterable
import polars as pl

# Whole-string Roman numeral (I..MMMCMXCIX), optionally followed by one period,
# matched case-insensitively. The lookahead requires the first character to be a
# numeral letter: every numeral group is optional, so without it an empty string
# or a lone "." (accepted by the optional trailing period) would match.
ROMAN_RE = re.compile(
    r'^(?=[MDCLXVI])M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\.?$',
    re.IGNORECASE
)
# A standalone word made only of Roman-numeral letters (validity is NOT checked
# here; callers validate the captured token with ROMAN_RE).
ROMAN_TOKEN_RE = re.compile(r'\b([IVXLCDM]+)\b\.?', re.IGNORECASE)
# A line starting with CHAPITRE / CHAP / CHAPTER; group(2) is whatever follows.
CHAP_RE = re.compile(r'^\s*(CHAPITRE|CHAP|CHAPTER)\b\.?\s*(.*)$', re.IGNORECASE)


def _norm_roman(tok: str) -> str:
    return tok.rstrip('.').upper()


def scan_headings_polars(path: str, prefer_chapitre: bool = True, allow_roman: bool = True, one_based: bool = True) -> pl.DataFrame:
    """
    Scan a text file line by line and collect chapter-heading candidates.

    Two kinds of headings are recognised:
      - a line matching CHAP_RE (only when `prefer_chapitre` is true); its Roman
        numeral is taken from the rest of the line or, failing that, from the
        next non-blank line (the latter only when `allow_roman` is true);
      - a line that is a bare Roman numeral (only when `allow_roman` is true).

    Returns a pl.DataFrame with two columns:
      - 'element'      : heading label, e.g. "I" or "CHAPITRE I" (or the raw
                         stripped line when a CHAP_RE heading has no numeral)
      - 'line_indices' : list of line indices where that label was seen
                         (1-based when `one_based` is true, otherwise 0-based)
    Labels appear in first-seen order.
    """
    with open(path, encoding='utf-8') as handle:
        lines = handle.read().splitlines()

    total = len(lines)
    shift = 1 if one_based else 0
    # Look-ahead numeral lines already attached to a preceding CHAP_RE heading;
    # they must not be recorded a second time as bare numerals.
    claimed = set()
    found = OrderedDict()

    def record(label: str, positions: List[int]) -> None:
        found.setdefault(label, []).extend(p + shift for p in positions)

    for pos, raw in enumerate(lines):
        stripped = raw.strip()
        if not stripped:
            continue

        chap = CHAP_RE.match(stripped) if prefer_chapitre else None
        if chap:
            positions = [pos]
            numeral = None
            tail = chap.group(2).strip()
            if tail:
                # Only the first numeral-looking word on the heading line is tried.
                token = ROMAN_TOKEN_RE.search(tail)
                if token and ROMAN_RE.fullmatch(token.group(1)):
                    numeral = _norm_roman(token.group(1))
            if not numeral:
                # Fall back to the next non-blank line (index `total` if none exists).
                nxt = next((k for k in range(pos + 1, total) if lines[k].strip()), total)
                if nxt < total and allow_roman and ROMAN_RE.match(lines[nxt].strip()):
                    numeral = _norm_roman(lines[nxt].strip())
                    positions.append(nxt)
                    claimed.add(nxt)
            record(f"CHAPITRE {numeral}" if numeral else stripped, positions)
            continue

        if allow_roman and pos not in claimed and ROMAN_RE.match(stripped):
            record(_norm_roman(stripped), [pos])

    return pl.DataFrame({
        "element": list(found.keys()),
        "line_indices": list(found.values()),
    })

def _roman_to_int(s: str) -> int:
    """
    Convert a Roman numeral string to its integer value.

    The input is normalised with _norm_roman first. Symbols are read right to
    left: a symbol smaller than the largest one seen so far is subtracted,
    otherwise it is added. Raises KeyError on a character that is not one of
    I, V, X, L, C, D, M.
    """
    weights = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    result = 0
    highest = 0
    for symbol in _norm_roman(s)[::-1]:
        weight = weights[symbol]
        if weight >= highest:
            result += weight
            highest = weight
        else:
            result -= weight
    return result

def _int_to_roman(n: int) -> str:
    numerals = [
        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
    ]
    res = []
    for val, sym in numerals:
        while n >= val:
            res.append(sym)
            n -= val
    return ''.join(res)

def _extract_roman_from_element(element: str) -> Optional[str]:
    """
    Pull a normalised Roman numeral out of a heading label.

    If the whole (stripped) label is a valid numeral, e.g. 'III', it is returned
    normalised. Otherwise the first standalone numeral-looking word is tried,
    e.g. 'XII' in 'CHAPITRE XII'. Returns None when neither yields a valid numeral.
    """
    text = element.strip()
    if ROMAN_RE.fullmatch(text):
        return _norm_roman(text)

    hit = ROMAN_TOKEN_RE.search(text)
    if hit is None:
        return None
    candidate = _norm_roman(hit.group(1))
    return candidate if ROMAN_RE.fullmatch(candidate) else None

def _build_candidates_by_roman(df: pl.DataFrame) -> Dict[str, List[int]]:
    """
    Group candidate line indices by Roman numeral.

    `df` is iterated as (element, line_indices) rows. Rows whose element holds
    no valid Roman numeral, or whose indices are not a non-empty list, are
    ignored. Labels that reduce to the same numeral (e.g. 'I' and 'CHAPITRE I')
    are merged. Each numeral maps to its indices de-duplicated and sorted
    ascending.
    """
    grouped: Dict[str, List[int]] = {}
    for element, idxs in df.iter_rows():
        roman = _extract_roman_from_element(element)
        if roman and isinstance(idxs, list) and idxs:
            grouped.setdefault(roman, []).extend(int(x) for x in idxs)
    return {roman: sorted(set(found)) for roman, found in grouped.items()}

_LETTER_TOKEN_RE = re.compile(r"[^\W\d_]+", flags=re.UNICODE)

def _has_all_caps_word(line: str, min_len: int = 2) -> bool:
    """
    True if the line contains a word of only letters, length >= min_len, and all letters uppercase.
    Examples that count: 'PREFACE', 'TABLE', 'III' (roman in caps), 'PRÉFACE'
    Examples that do NOT count: 'John', 'I' (single letter), 'Title-Case' (split as 'Title', 'Case' -> both not isupper)
    """
    for tok in _LETTER_TOKEN_RE.findall(line):
        if len(tok) >= min_len and tok.isupper():
            return True
    return False

def _invalidates_first_I(lines: List[str], i_one_based: int, window: Iterable[int] = (1, 2, 3), require_full_window: bool = True) -> bool:
    """
    Decide whether a candidate line for chapter I should be rejected.

    Parameters:
      lines               : the file's lines
      i_one_based         : 1-based index of the candidate heading line
      window              : offsets (relative to the candidate) of the lines to inspect
      require_full_window : when true, a candidate whose window runs past the
                            end of the file is rejected

    Returns True (reject) if:
      - require_full_window is true and any window line does not exist, OR
      - any existing window line contains an ALL-CAPS word (length >= 2).
    """
    # `window` is iterated twice below; materialise it so that a one-shot
    # iterable (e.g. a generator) is not exhausted by the bounds check, which
    # would silently skip the ALL-CAPS check.
    offsets = tuple(window)
    n = len(lines)
    i0 = i_one_based - 1

    if require_full_window and any(i0 + off >= n for off in offsets):
        return True

    for off in offsets:
        j = i0 + off
        # Offsets that fall outside the file are ignored here.
        if 0 <= j < n and _has_all_caps_word(lines[j].lstrip(), min_len=2):
            return True
    return False

def select_monotonic_chapters(
    df: pl.DataFrame,
    target_max: int = 26,
    require_complete: bool = False,
    keep_candidates: bool = False,
    min_lines: int = 8,
    enforce_last_min: bool = False,
    total_lines: Optional[int] = None,
    lines: Optional[List[str]] = None,
    require_full_window_for_I: bool = True,
) -> pl.DataFrame:
    """
    Pick one line index per chapter (I..target_max) so that:
      - indices strictly increase
      - each chapter start is at least `min_lines` after the previous start
      - special rule for chapter I (only when `lines` is given):
          reject a candidate i if any of i+1, i+2, i+3 has an ALL-CAPS word (len >= 2),
          or (when require_full_window_for_I is true) if those lookahead lines are missing.

    Parameters:
      df               : (element, line_indices) frame of heading candidates
      target_max       : highest chapter number to look for
      require_complete : raise ValueError instead of skipping a chapter that
                         cannot be placed (or a last chapter that is too short)
      keep_candidates  : add a 'candidates' column with each chapter's deduped candidates
      min_lines        : minimum distance between consecutive chapter starts
      enforce_last_min : together with `total_lines`, drop trailing chapters that
                         have fewer than `min_lines` lines before end of file
      total_lines      : number of lines in the file (used only by enforce_last_min)
      lines            : raw file lines; enables the chapter I rule

    Returns a DataFrame with columns 'element' (Roman numeral) and 'line'
    (chosen index), plus 'candidates' when keep_candidates is true. When no
    chapter can be placed, an empty DataFrame with those columns is returned.
    """
    by_roman = _build_candidates_by_roman(df)
    order = [_int_to_roman(i) for i in range(1, target_max + 1)]
    out_rows = []
    prev = 0  # line of the last placed chapter; 0 means none placed yet

    for r in order:
        candidates = by_roman.get(r, [])
        threshold = 1 if prev == 0 else prev + min_lines

        chosen = None
        if r == "I" and lines is not None:
            for x in candidates:
                if x >= threshold and not _invalidates_first_I(lines, x, require_full_window=require_full_window_for_I):
                    chosen = x
                    break
        else:
            # Candidates are sorted ascending, so this is the earliest feasible one.
            chosen = next((x for x in candidates if x >= threshold), None)

        if chosen is None:
            if require_complete:
                rule = " with I-rule" if (r == "I" and lines is not None) else ""
                raise ValueError(f"No feasible index for chapter {r}{rule}. Threshold={threshold}. Candidates={candidates}")
            continue

        row = {"element": r, "line": chosen}
        if keep_candidates:
            row["candidates"] = candidates
        out_rows.append(row)
        prev = chosen

    if enforce_last_min and total_lines is not None:
        # Drop trailing chapters until the last one has at least `min_lines` lines left.
        while out_rows:
            last_line = out_rows[-1]["line"]
            if (total_lines - last_line + 1) >= min_lines:
                break
            if require_complete:
                raise ValueError(
                    f"Last chapter {out_rows[-1]['element']} at line {last_line} "
                    f"has fewer than {min_lines} lines until EOF ({total_lines})."
                )
            out_rows.pop()

    if not out_rows:
        # pl.from_dicts cannot infer a schema from an empty list and raises;
        # return a typed empty frame so callers can test `.height == 0`.
        schema = {"element": pl.Utf8, "line": pl.Int64}
        if keep_candidates:
            schema["candidates"] = pl.List(pl.Int64)
        return pl.DataFrame(schema=schema)

    return pl.from_dicts(out_rows)

In [8]:
#@title Application des fonctions et constitution d'un dictionnaire

from glob import glob
import os
import polars as pl


# Directory whose *.txt files are scanned for chapter headings by build_starts_index.
folder = "/content"


def load_chapters(path, start_lines):
    """
    Extract chapters from a text file based on start line indices.

    Parameters:
      path        : path of the UTF-8 text file
      start_lines : mapping {chapter label: 1-based start line}, numbered the
                    way scan_headings_polars numbers lines

    Returns a dict {chapter label: chapter text}. Each chapter runs from its
    start line up to the line before the next chapter's start (or to the end
    of the file for the last one), with trailing newlines removed.

    Raises ValueError if a start line lies outside the file.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # The start indices are produced from str.splitlines(), which also breaks
        # on form feeds, U+2028, etc. Splitting the same way here (rather than
        # with readlines(), which only breaks on newlines) keeps the numbering
        # aligned when the text contains such characters.
        lines = f.read().splitlines(keepends=True)

    n_lines = len(lines)
    # Validate and convert to 0-based indices.
    starts = []
    for roman, lineno in start_lines.items():
        if lineno < 1 or lineno > n_lines:
            raise ValueError(f"Invalid start index for chapter {roman}: line {lineno} (file has {n_lines} lines)")
        starts.append((roman, lineno - 1))

    starts.sort(key=lambda x: x[1])

    # Build the dictionary { 'I': 'text of chapter I', ... }
    chapters = {}
    for i, (roman, start_idx) in enumerate(starts):
        # A chapter ends where the next one starts, or at end of file.
        next_start = starts[i + 1][1] if i + 1 < len(starts) else n_lines
        chapters[roman] = ''.join(lines[start_idx:next_start]).rstrip('\n')

    return chapters


def build_starts_index(folder: str, target_max: int = 26, min_lines: int = 8):
    """
    Build an index of chapter starts from all text files in a folder.

    Every `*.txt` file directly inside `folder` is scanned for headings and one
    start line per chapter is selected. Files with no selected chapter are skipped.

    Parameters:
      folder     : directory containing the text files
      target_max : highest chapter number to look for in each file
      min_lines  : minimum distance between consecutive chapter starts

    Returns a tuple (starts_df, starts_map):
      starts_df  : DataFrame with columns file, path, chapter, start_line,
                   sorted by file then start_line (empty but typed when
                   nothing was found)
      starts_map : {path: {chapter: start_line}}
    """
    start_rows = []
    starts_map = {}

    for path in sorted(glob(f"{folder}/*.txt")):
        # The raw lines are needed for the chapter I look-ahead rule.
        with open(path, encoding="utf-8") as f:
            lines = f.read().splitlines()

        df_scan = scan_headings_polars(path)
        selected = select_monotonic_chapters(
            df_scan,
            target_max=target_max,
            min_lines=min_lines,
            lines=lines,
        )

        if selected.height == 0:
            continue

        fname = os.path.basename(path)
        for roman, line in selected.select(["element", "line"]).iter_rows():
            start_rows.append({
                "file": fname,
                "path": path,
                "chapter": roman,
                "start_line": int(line),
            })
            starts_map.setdefault(path, {})[roman] = int(line)

    if start_rows:
        starts_df = pl.from_dicts(start_rows).sort(["file", "start_line"])
    else:
        # Give the empty frame the same column types as the populated one;
        # building it from empty lists would leave the dtypes unspecified.
        starts_df = pl.DataFrame(schema={
            "file": pl.Utf8,
            "path": pl.Utf8,
            "chapter": pl.Utf8,
            "start_line": pl.Int64,
        })
    return starts_df, starts_map


# Build chapter start index: one row per (file, chapter) for every *.txt in
# `folder`, looking for chapters I..XXVI with starts at least 8 lines apart.
starts_df, starts_map = build_starts_index(folder, target_max=26, min_lines=8)
print("Chapter start index DataFrame:")
print(starts_df)

# Load chapters for each book.
# book_chapters maps file name -> {chapter numeral -> chapter text}.
book_chapters = {}
for path, start_lines in starts_map.items():
    # Keyed by base name, so files are assumed to have distinct names.
    fname = os.path.basename(path)
    chapters = load_chapters(path, start_lines)
    book_chapters[fname] = chapters

Chapter start index DataFrame:
shape: (10, 4)
┌─────────────────────────┬─────────────────────────────────┬─────────┬────────────┐
│ file                    ┆ path                            ┆ chapter ┆ start_line │
│ ---                     ┆ ---                             ┆ ---     ┆ ---        │
│ str                     ┆ str                             ┆ str     ┆ i64        │
╞═════════════════════════╪═════════════════════════════════╪═════════╪════════════╡
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ I       ┆ 19         │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ II      ┆ 331        │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ III     ┆ 939        │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ IV      ┆ 1865       │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ V       ┆ 2643       │
│ Bouvard_et_Pecuchet.txt ┆ /content/Bouvard_et_Pecuchet.t… ┆ VI      ┆ 3211       │
│ Bouvard_et_Pecuch

In [16]:
# Define input and output file paths
input_file = "/content/Bouvard_et_Pecuchet.txt"
output_file = "/content/Bouvard_et_Pecuchet_chapitre_1.txt"

# Define the line range to retain (1-based, both ends inclusive).
# Per the chapter index printed above, the chapter I heading is at line 19 and
# the chapter II heading at line 331, so this range lies inside chapter I.
# NOTE(review): the values are hard-coded; presumably 22 skips the heading and
# the blank lines after it - confirm against the file.
start_line = 22
end_line = 330

# Open the input file and extract the required lines
with open(input_file, "r", encoding="utf-8") as infile:
    lines = infile.readlines()

# Select lines start_line..end_line inclusive. Python lists are 0-based and
# slices exclude their end, hence `start_line - 1` and a bare `end_line`.
selected_lines = lines[start_line - 1:end_line]

# Write the selected lines to a new file
with open(output_file, "w", encoding="utf-8") as outfile:
    outfile.writelines(selected_lines)

print(f"Lines {start_line} to {end_line} have been saved to {output_file}")

Lines 22 to 330 have been saved to /content/Bouvard_et_Pecuchet_chapitre_1.txt


In [17]:
import re

# Chapter 1 text written by the extraction cell above.
file_path = "/content/Bouvard_et_Pecuchet_chapitre_1.txt"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

# Split after a run of one or more '.', '!' or '?' that is followed by a
# whitespace character (space, newline, ...) or by the end of the text.
# The punctuation and that whitespace character are discarded.
# NOTE(review): this is a heuristic - an abbreviation such as "M. " also counts
# as a sentence end, and punctuation directly followed by a closing quote does not.
sentences = re.split(r'[.!?]+(?:\s|$)', text)

# Drop empty fragments and surrounding whitespace.
sentences = [s.strip() for s in sentences if s.strip()]

num_sentences = len(sentences)

print(f"Number of sentences in the file: {num_sentences}")

Number of sentences in the file: 389


In [None]:
pip install sacremoses torch -q

In [None]:
# Load model directly: fetch the tokenizer and the masked-language-model weights
# for the "flaubert/flaubert_base_uncased" checkpoint by name.
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Both objects are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_uncased")
model = AutoModelForMaskedLM.from_pretrained("flaubert/flaubert_base_uncased")