In [None]:
#@title Téléchargement des oeuvres de Flaubert

import urllib.request
import os

# Project Gutenberg plain-text editions of Flaubert's works.
# Each key doubles as the local file name (without the .txt extension).
flaubert = dict([
    ("Madame_Bovary", "https://www.gutenberg.org/cache/epub/14155/pg14155.txt"),
    ("L_Education_sentimentale", "https://www.gutenberg.org/cache/epub/49773/pg49773.txt"),
    ("Bouvard_et_Pecuchet", "https://www.gutenberg.org/cache/epub/14157/pg14157.txt"),
    ("Salammbô", "https://www.gutenberg.org/cache/epub/48881/pg48881.txt"),
    ("Trois_Contes", "https://www.gutenberg.org/cache/epub/12065/pg12065.txt"),
    ("Un_Coeur_simple", "https://www.gutenberg.org/cache/epub/26812/pg26812.txt"),
    ("Dictionnaire_des_idees_recues", "https://www.gutenberg.org/cache/epub/14156/pg14156.txt"),
])

# Destination folder for the raw downloads (created on first run).
dossier_sortie = '/content/romans_Flaubert'
os.makedirs(dossier_sortie, exist_ok=True)

# Fetch each work and store it as <folder>/<title>.txt
for titre in flaubert:
    url = flaubert[titre]
    chemin_fichier = os.path.join(dossier_sortie, titre + ".txt")

    print("Téléchargement de '" + titre + "'...")

    urllib.request.urlretrieve(url, chemin_fichier)

print("\nTous les ouvrages ont été téléchargés.")

Téléchargement de 'Madame_Bovary'...
Téléchargement de 'L_Education_sentimentale'...
Téléchargement de 'Bouvard_et_Pecuchet'...
Téléchargement de 'Salammbô'...
Téléchargement de 'Trois_Contes'...
Téléchargement de 'Un_Coeur_simple'...
Téléchargement de 'Dictionnaire_des_idees_recues'...

Tous les ouvrages ont été téléchargés.


In [None]:
#@title Méthode de découpage basée sur les normes des fichiers issus du Projet Gutenberg

from pathlib import Path
import re, os, tempfile, unicodedata

# Regexes recognising the Project Gutenberg boilerplate delimiters. They are
# applied with .search() to ONE line at a time, so every pattern is anchored at
# the beginning of the line (optionally preceded by whitespace / asterisks).
#
# Why the anchoring matters: the licence that follows the end marker is full of
# ordinary sentences mentioning "Project Gutenberg". Unanchored substring
# patterns such as `project gutenberg.*end` match lines like
# "Project Gutenberg(TM) depends upon ..." (dep-END-s) or "... employees expend
# considerable effort", so a marker search could stop in the middle of the
# licence instead of on the real "*** END OF THE PROJECT GUTENBERG EBOOK" line.
START_PATTERNS = [
    # "*** START OF THE PROJECT GUTENBERG EBOOK ...", "BEGIN PROJECT GUTENBERG ..."
    re.compile(r'(?i)^[\s*]*(?:start|begin)(?:\s+of)?(?:\s+the)?\s+project\s+gutenberg'),
    # "*** START OF THIS PROJECT GUTENBERG EBOOK ..."
    re.compile(r'(?i)^[\s*]*start\s+of\s+this\s+project\s+gutenberg'),
    # Asterisk-flagged line naming Project Gutenberg before the whole word "start"
    re.compile(r'(?i)^\s*\*+.*\bproject\s+gutenberg.*\bstart\b'),
]
END_PATTERNS = [
    # "*** END OF THE PROJECT GUTENBERG EBOOK ...", "End of Project Gutenberg's ..."
    re.compile(r'(?i)^[\s*]*(?:end|finis?h|stop)(?:\s+of)?(?:\s+the)?\s+project\s+gutenberg'),
    # "*** END OF THIS PROJECT GUTENBERG EBOOK ..."
    re.compile(r'(?i)^[\s*]*end\s+of\s+this\s+project\s+gutenberg'),
    # Asterisk-flagged line naming Project Gutenberg before the whole word "end"
    re.compile(r'(?i)^\s*\*+.*\bproject\s+gutenberg.*\bend\b'),
]

def _norm(s: str) -> str:
    # normalize and unify NBSP etc.
    return unicodedata.normalize('NFKC', s).replace('\u00A0', ' ')

def _find_marker(lines, patterns, reverse=False):
    """Return the index of the first line matched by any of ``patterns``, or None.

    Lines are visited front-to-back, or back-to-front when ``reverse`` is true.
    Each line is normalised with ``_norm`` and stripped of surrounding
    whitespace before the patterns are tried with ``.search()``.
    """
    indices = range(len(lines))
    if reverse:
        indices = reversed(indices)
    for idx in indices:
        candidate = _norm(lines[idx]).strip()
        if any(rx.search(candidate) for rx in patterns):
            return idx
    return None

def gutenberg_strip_text(text: str):
    """Remove the Project Gutenberg header and footer from ``text``.

    Returns a tuple ``(body, s_idx, e_idx)``:

    * ``body``  - the text between the start and end marker lines, without
      leading/trailing blank lines. The text is always passed through ``_norm``
      and has ``\\r\\n`` converted to ``\\n``, even when nothing is stripped.
    * ``s_idx`` - line index of the start marker, or None if absent.
    * ``e_idx`` - line index of the end marker, or None if absent.

    If neither marker is found, or the markers leave no lines between them,
    the (normalised) full text is returned.
    """
    text = _norm(text).replace('\r\n', '\n')
    lines = text.splitlines(keepends=True)
    s_idx = _find_marker(lines, START_PATTERNS, reverse=False)

    # First line that belongs to the body (0 when there is no start marker).
    start = (s_idx + 1) if s_idx is not None else 0

    # Take the FIRST end marker that follows the start marker. Searching from
    # the bottom of the file is unsafe: the licence that comes after the real
    # "*** END OF ..." line contains many sentences mentioning Project
    # Gutenberg, and a bottom-up search can stop on one of them, leaving most
    # of the licence inside the returned body.
    rel_idx = _find_marker(lines[start:], END_PATTERNS, reverse=False)
    e_idx = (start + rel_idx) if rel_idx is not None else None

    # if neither marker found, return original and markers=None
    if s_idx is None and e_idx is None:
        return text, None, None

    end = e_idx if e_idx is not None else len(lines)
    # safety: if the markers enclose nothing, do not strip (return original)
    if start >= end:
        return text, s_idx, e_idx

    # Trim blank lines at both ends by moving the slice bounds inwards.
    while start < end and lines[start].strip() == '':
        start += 1
    while end > start and lines[end - 1].strip() == '':
        end -= 1
    return ''.join(lines[start:end]), s_idx, e_idx

# Token counter: use tiktoken's "cl100k_base" encoding when it can be loaded,
# otherwise count whitespace-separated words.
try:
    import tiktoken
    enc = tiktoken.get_encoding("cl100k_base")
except Exception:
    def token_count(s: str) -> int:
        """Count whitespace-delimited words (0 for blank input)."""
        if not s.strip():
            return 0
        return len(re.findall(r'\S+', s))
else:
    def token_count(s: str) -> int:
        """Count tokens produced by the tiktoken encoding ``enc``."""
        return len(enc.encode(s))

# Strip every .txt file of the folder in place and report the tokens removed.
FOLDER = Path("/content/romans_Flaubert")
if not FOLDER.exists():
    raise SystemExit(f"Folder not found: {FOLDER}")

files = sorted(FOLDER.glob("*.txt"))
if len(files) == 0:
    print("No .txt files found in", FOLDER)

total_deleted = 0
for path in files:
    txt = path.read_text(encoding="utf-8", errors="replace")
    stripped, s_idx, e_idx = gutenberg_strip_text(txt)

    if s_idx is not None or e_idx is not None:
        orig_tokens, new_tokens = token_count(txt), token_count(stripped)
        deleted = max(0, orig_tokens - new_tokens)

        # Write to a temporary file in the same directory, then swap it in with
        # os.replace so the target is never left half-written.
        tmp = tempfile.NamedTemporaryFile("w", delete=False, encoding="utf-8", dir=path.parent)
        with tmp:
            tmp.write(stripped)
        os.replace(tmp.name, path)

        total_deleted += deleted
        print(f"{path.name}: start_line={s_idx} end_line={e_idx} tokens_deleted={deleted} (orig {orig_tokens} → {new_tokens})")
    else:
        # No marker at all: the file is not rewritten.
        print(f"{path.name}: left unchanged")

print(f"\nProcessed {len(files)} files. Total tokens deleted: {total_deleted}")

Bouvard_et_Pecuchet.txt: start_line=22 end_line=12968 tokens_deleted=716 (orig 176891 → 176175)
Dictionnaire_des_idees_recues.txt: start_line=22 end_line=2931 tokens_deleted=721 (orig 28750 → 28029)
L_Education_sentimentale.txt: start_line=25 end_line=9690 tokens_deleted=812 (orig 135147 → 134335)
Madame_Bovary.txt: start_line=22 end_line=14832 tokens_deleted=705 (orig 219615 → 218910)
Salammbô.txt: start_line=25 end_line=13529 tokens_deleted=791 (orig 220295 → 219504)
Trois_Contes.txt: start_line=25 end_line=4158 tokens_deleted=747 (orig 63576 → 62829)
Un_Coeur_simple.txt: start_line=22 end_line=1968 tokens_deleted=691 (orig 25973 → 25282)

Processed 7 files. Total tokens deleted: 5183


In [None]:
#@title camembert-ner sur Madame Bovary

!pip install -q transformers[torch] datasets sentencepiece nltk
!pip install -q --upgrade accelerate

from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import json, csv, os, math
from pathlib import Path
from tqdm.auto import tqdm

MODEL_NAME = "Jean-Baptiste/camembert-ner"   # Hugging Face model id used for French NER
TXT_PATH = "/content/romans_Flaubert/Madame_Bovary.txt"      # path to the plain-text novel
OUTPUT_DIR = "/content/ner_bovary"           # where the JSON/CSV results are written
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pipeline device index: 0 = first CUDA GPU, -1 = CPU. Falling back to the CPU
# keeps the cell runnable on a runtime without a GPU instead of failing when
# the pipeline is created.
import torch
DEVICE = 0 if torch.cuda.is_available() else -1

# 4) Load model/tokenizer and build the HF NER pipeline on DEVICE
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
# Ask the pipeline for grouped entities; the keyword differs between releases.
try:
    nlp = pipeline("ner", model=model, tokenizer=tokenizer,
                   device=DEVICE, aggregation_strategy="simple")
except TypeError:
    # fallback for older transformers versions that do not accept
    # `aggregation_strategy`
    nlp = pipeline("ner", model=model, tokenizer=tokenizer,
                   device=DEVICE, grouped_entities=True)

# 5) Read the book
with open(TXT_PATH, "r", encoding="utf-8") as f:
    text = f.read()

# 6) Split the book into sentences.
# `nltk` and `sent_tokenize` are imported (and 'punkt' downloaded) at the top
# of this cell, so only the extra 'punkt_tab' resource is fetched here.
# NOTE(review): 'punkt_tab' appears to be what recent NLTK releases load for
# sent_tokenize - confirm against the installed NLTK version.
nltk.download('punkt_tab')

sentences = sent_tokenize(text, language='french')
print(f"Total sentences: {len(sentences)}")

# 7) Build chunks (accumulate sentences until token length near model max)
#
# Every chunk is an exact slice of `text`. The NER step later converts the
# pipeline's chunk-relative character offsets to absolute ones with
# `chunk_start + offset`; that is only valid if the chunk string is
# character-for-character identical to text[chunk_start:...]. Re-joining
# sentences with a single space would not be, because the gaps between
# sentences in `text` (line breaks, blank lines, several spaces) are usually
# not exactly one character long.
max_tokens = tokenizer.model_max_length
# keep a margin to avoid truncation:
MARGIN = 32
chunks = []
chunk_start_indices = []  # start char index of each chunk in original text

cursor = 0                  # position in `text` from which the next sentence is searched
current_chunk = ""          # slice of `text` accumulated so far
current_chunk_start = None  # char offset of current_chunk inside `text`

for sent in sentences:
    # locate the sentence in the original text to get its true char offsets
    found = text.find(sent, cursor)
    if found == -1:
        # fallback: assume the sentence starts at the cursor (rare)
        found = cursor
    sent_end = min(found + len(sent), len(text))
    cursor = sent_end

    if not current_chunk:
        # Open a new chunk with this sentence. A sentence that is by itself
        # over the budget still becomes a chunk of its own (no empty chunk is
        # emitted in front of it).
        current_chunk_start = found
        current_chunk = text[found:sent_end]
        continue

    # Candidate = current chunk extended up to the end of this sentence,
    # including whatever separates the two in the original text.
    candidate = text[current_chunk_start:sent_end]
    token_len = len(tokenizer(candidate)["input_ids"])
    if token_len + MARGIN < max_tokens:
        current_chunk = candidate
    else:
        # flush existing chunk
        chunks.append(current_chunk)
        chunk_start_indices.append(current_chunk_start)
        # start new chunk with current sentence
        current_chunk = text[found:sent_end]
        current_chunk_start = found

# flush last
if current_chunk:
    chunks.append(current_chunk)
    chunk_start_indices.append(current_chunk_start)

print(f"Total chunks built: {len(chunks)} (model max tokens: {max_tokens})")

# 8) Run NER on each chunk in turn and collect entities with absolute offsets
all_entities = []
chunk_pairs = list(zip(chunks, chunk_start_indices))
for chunk_text, chunk_start in tqdm(chunk_pairs, desc="NER chunks"):
    # Each result is a dict; this loop reads 'entity_group' (or 'entity' /
    # 'label'), 'start', 'end', 'score' and 'word' from it.
    for ent in nlp(chunk_text):
        # grouped output exposes 'entity_group'; otherwise fall back to
        # 'entity', then 'label'
        ent_type = ent["entity_group"] if "entity_group" in ent else ent.get("entity", ent.get("label"))

        start, end = ent.get("start"), ent.get("end")
        if start is None or end is None:
            # no character offsets for this entity: it cannot be located, skip it
            continue

        # shift chunk-relative offsets by the chunk's position in `text`
        abs_start, abs_end = chunk_start + start, chunk_start + end
        # up to 40 characters of context on each side, clamped to the text bounds
        snippet = text[max(0, abs_start-40):min(len(text), abs_end+40)]

        record = {
            "text": ent.get("word", text[abs_start:abs_end]),
            "type": ent_type,
            "start": abs_start,
            "end": abs_end,
            "score": float(ent.get("score", 0.0)),
            "context": snippet
        }
        all_entities.append(record)

print(f"Raw extracted entities: {len(all_entities)}")

# 9) Post-processing: collapse entities sharing the same (start, end, type, text),
# keeping the highest-scoring one (the first seen wins on ties)
unique = {}
for e in all_entities:
    key = (e["start"], e["end"], e["type"], e["text"])
    best_so_far = unique.get(key)
    if best_so_far is None or best_so_far["score"] < e["score"]:
        unique[key] = e

# order by position in the text, then by decreasing score
entities_dedup = sorted(unique.values(), key=lambda x: (x["start"], -x["score"]))

print(f"Entities after deduplication: {len(entities_dedup)}")

# 10) Persist the deduplicated entities as JSON and as CSV in OUTPUT_DIR
json_path = os.path.join(OUTPUT_DIR, "madame_bovary_ner.json")
csv_path = os.path.join(OUTPUT_DIR, "madame_bovary_ner.csv")

with open(json_path, "w", encoding="utf-8") as f:
    json.dump(entities_dedup, f, ensure_ascii=False, indent=2)

# newline='' lets the csv module control line endings itself
with open(csv_path, "w", encoding="utf-8", newline='') as f:
    columns = ["text","type","start","end","score","context"]
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()
    writer.writerows(entities_dedup)

print("Saved:", json_path, csv_path)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


Total sentences: 6906
Total chunks built: 393 (model max tokens: 512)


NER chunks:   0%|          | 0/393 [00:00<?, ?it/s]

Raw extracted entities: 3251
Entities after deduplication: 3251
Saved: /content/ner_bovary/madame_bovary_ner.json /content/ner_bovary/madame_bovary_ner.csv


In [None]:
# === PERSON EXTRACTION TOOL ===

import os
import json
import csv
from collections import defaultdict, Counter
import pandas as pd
import re
from difflib import SequenceMatcher, get_close_matches

# Optional: fuzzy merging
# !pip install -q rapidfuzz
# from rapidfuzz import process, fuzz

# ---------- USER SETTINGS ----------
# INPUT_PATH must be the file written by the camembert-ner cell, which saves
# its results under /content/ner_bovary. Pointing at any other folder either
# fails on a fresh runtime or silently analyses a stale file from an earlier run.
INPUT_PATH = "/content/ner_bovary/madame_bovary_ner.json"  # change to your CSV or JSON
OUTPUT_DIR = "/content/persons_output"
FUZZY_MERGE = True     # try to merge similar names (True = attempt fuzzy merging)
FUZZY_SIMILARITY_THRESHOLD = 0.87  # 0..1 for difflib; if using rapidfuzz this would be 87
# -----------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- Helpers ----------
def load_input(path):
    """Load entity records from a ``.json`` or ``.csv`` file as a list of dicts.

    JSON input must be either a list or a dict whose ``"entities"`` value is a
    list (that list is returned). CSV input is read with every column as a
    string and missing cells replaced by ``""``.

    Raises:
        ValueError: if the extension is neither .json nor .csv, or if the JSON
            document has an unrecognised shape.
    """
    path = str(path)
    lowered = path.lower()

    if lowered.endswith(".csv"):
        frame = pd.read_csv(path, dtype=str).fillna("")
        return frame.to_dict(orient="records")

    if not lowered.endswith(".json"):
        raise ValueError("Unsupported input format: must be .json or .csv")

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        return data
    if isinstance(data, dict) and isinstance(data.get("entities"), list):
        return data["entities"]
    raise ValueError("JSON structure not recognized: expected list of entity dicts or {'entities': [...]}")

def is_person_label(label_value):
    """Return True if the label value indicates a person name.

    The label is upper-cased and split on every non-letter character, so
    tagging-scheme prefixes and separators ("B-PER", "I_PERSON") are ignored.
    It is a person label when one of the resulting tokens is exactly one of
    PER, PERS, PERSON, PERSONNE or NER ("NER" is kept because earlier versions
    of this helper accepted it).

    Whole-token comparison is deliberate: a substring test would also accept
    unrelated labels that merely contain those letters, e.g. "PERCENT".
    """
    if not label_value:
        return False
    person_tokens = {"PER", "PERS", "PERSON", "PERSONNE", "NER"}
    tokens = re.split(r"[^A-Z]+", str(label_value).upper())
    return any(tok in person_tokens for tok in tokens)

def normalize_name(name):
    """Clean an entity surface form for counting.

    ``None`` gives ``""``. Otherwise the value is converted to ``str`` and, in
    order: outer whitespace is removed, "##" markers are deleted, "▁" is turned
    into a space, whitespace runs are collapsed to one space, and spaces,
    quotes, guillemets, parentheses and ``, ; : .`` are stripped from both ends.
    """
    if name is None:
        return ""
    cleaned = str(name).strip()
    # tokenizer artefacts: '##' sub-word markers and the '▁' word-boundary mark
    for marker, substitute in (("##", ""), ("▁", " ")):
        cleaned = cleaned.replace(marker, substitute)
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.strip(" \"'«»(),;:.")

def similar(a, b):
    """Return the difflib ``SequenceMatcher`` ratio of ``a`` and ``b`` (0..1).

    An empty or missing argument yields 0.0.
    """
    if a and b:
        return SequenceMatcher(None, a, b).ratio()
    return 0.0

# ---------- Load ----------
records = load_input(INPUT_PATH)
print(f"Loaded {len(records)} records from {INPUT_PATH}")

# ---------- Extract person-name occurrences ----------
# One dict per person mention: raw, name, label, score, start, end, context.
occurrences = []
for rec in records:
    # The surface form is read ONLY from 'word' or 'text'. The label-bearing
    # keys ('entity', 'label', 'entity_group', 'type') must not be used as a
    # fallback: a record without a surface form would otherwise be counted as
    # a person literally named after its label (e.g. "PER").
    name = None
    for k in ("word", "text"):
        if rec.get(k) not in (None, ""):
            name = rec[k]
            break
    if name is None:
        # nothing to count for this record
        continue

    # Label value, looked up under the usual key names.
    label_value = rec.get("entity_group") or rec.get("type") or rec.get("label") or rec.get("entity") or ""
    if not is_person_label(label_value):
        continue

    name_norm = normalize_name(name)
    if not name_norm:
        # only punctuation / tokenizer artefacts were left
        continue

    # Score may be missing, None or a non-numeric string (CSV input is read as
    # strings): fall back to 0.0 in those cases.
    try:
        score = float(rec.get("score", rec.get("probability", 0.0) or 0.0))
    except (TypeError, ValueError):
        score = 0.0

    occurrences.append({
        "raw": name,
        "name": name_norm,
        "label": label_value,
        "score": score,
        "start": rec.get("start"),
        "end": rec.get("end"),
        "context": rec.get("context", "")
    })

print(f"Detected {len(occurrences)} person occurrences (pre-dedup).")

# ---------- Aggregate exact names ----------
# Per normalised name: number of mentions, sum of scores, list of mentions.
counts = defaultdict(int)
scores_sum = defaultdict(float)
occ_list = defaultdict(list)
for occ in occurrences:
    person = occ["name"]
    occ_list[person].append(occ)
    counts[person] = counts[person] + 1
    scores_sum[person] = scores_sum[person] + occ["score"]

# Distinct names, most frequent first; ties are broken alphabetically.
unique_names = sorted(counts, key=lambda n: (-counts[n], n))
print(f"{len(unique_names)} unique person strings found before fuzzy merging.")

# ---------- Optional fuzzy merging ----------
if FUZZY_MERGE and unique_names:
    merged_map = {}   # original name -> canonical name
    canonicals = []   # canonical names, in order of creation

    # unique_names is ordered by decreasing frequency, so the most frequent
    # spelling of a group is the one that becomes its canonical form.
    for name in unique_names:
        lowered = name.lower()
        # first existing canonical that is similar enough (case-insensitive)
        match = next(
            (c for c in canonicals
             if similar(lowered, c.lower()) >= FUZZY_SIMILARITY_THRESHOLD),
            None,
        )
        if match is None:
            # no close match: this name starts a new group
            canonicals.append(name)
            match = name
        merged_map[name] = match

    # If you prefer rapidfuzz, you could do a higher-quality matching:
    # use rapidfuzz.process.extractOne(name, canonicals, scorer=fuzz.WRatio)
    # but avoid requiring extra install by default.

    # Re-aggregate counts, score sums and mentions under the canonical names
    merged_counts = defaultdict(int)
    merged_scores_sum = defaultdict(float)
    merged_occ_list = defaultdict(list)
    for orig_name, cnt in counts.items():
        canon = merged_map.get(orig_name, orig_name)
        merged_counts[canon] += cnt
        merged_scores_sum[canon] += scores_sum[orig_name]
        merged_occ_list[canon].extend(occ_list[orig_name])

    # the merged aggregates replace the exact-name ones from here on
    counts, scores_sum, occ_list = merged_counts, merged_scores_sum, merged_occ_list

print(f"{len(counts)} names after merging.")

# ---------- Prepare outputs ----------
# One row per name, most frequent first (ties broken alphabetically).
ranked = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
rows = [
    {
        "name": name,
        "count": cnt,
        "mean_score": scores_sum[name] / cnt if cnt else 0.0,
        # context of the first recorded mention, if there is one
        "example_context": occ_list[name][0]["context"] if occ_list[name] else "",
        "occurrences": occ_list[name],
    }
    for name, cnt in ranked
]

# persons_unique.txt: one name per line
txt_path = os.path.join(OUTPUT_DIR, "persons_unique.txt")
with open(txt_path, "w", encoding="utf-8") as f:
    f.writelines(r["name"] + "\n" for r in rows)

# persons_counts.csv: summary columns only ('occurrences' is left out)
csv_path = os.path.join(OUTPUT_DIR, "persons_counts.csv")
with open(csv_path, "w", encoding="utf-8", newline='') as f:
    writer = csv.DictWriter(
        f,
        fieldnames=["name","count","mean_score","example_context"],
        extrasaction="ignore",
    )
    writer.writeheader()
    writer.writerows(rows)

# persons_normalized.json: full rows, including every mention
json_path = os.path.join(OUTPUT_DIR, "persons_normalized.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(rows, f, ensure_ascii=False, indent=2)

print("Wrote outputs to:", OUTPUT_DIR)
print("Top 20 persons (by frequency):")
top_rows = rows[:20]
for r in top_rows:
    print(f"  {r['name']} — {r['count']} occurrences, mean score {r['mean_score']:.3f}")

# Best-effort rich table of the first 30 names; any failure here is ignored.
try:
    import pandas as pd
    from IPython.display import display
    summary_columns = ("name", "count", "mean_score")
    display_df = pd.DataFrame([{col: r[col] for col in summary_columns} for r in rows])
    display(display_df.head(30))
except Exception:
    pass


Loaded 3279 records from /content/ner_results/madame_bovary_ner.json
Detected 1929 person occurrences (pre-dedup).
366 unique person strings found before fuzzy merging.
338 names after merging.
Wrote outputs to: /content/persons_output
Top 20 persons (by frequency):
  Charles — 300 occurrences, mean score 0.833
  Emma — 199 occurrences, mean score 0.708
  Léon — 118 occurrences, mean score 0.797
  Homais — 104 occurrences, mean score 0.988
  Rodolphe — 96 occurrences, mean score 0.791
  Madame — 48 occurrences, mean score 0.813
  Lheureux — 42 occurrences, mean score 0.991
  M. Homais — 40 occurrences, mean score 0.923
  Justin — 36 occurrences, mean score 0.911
  Binet — 28 occurrences, mean score 0.995
  Félicité — 26 occurrences, mean score 0.775
  Hippolyte — 24 occurrences, mean score 0.858
  M. Lheureux — 24 occurrences, mean score 0.955
  madame Homais — 22 occurrences, mean score 0.891
  Monsieur — 21 occurrences, mean score 0.830
  Rouault — 16 occurrences, mean score 0.969
  

Unnamed: 0,name,count,mean_score
0,Charles,300,0.833176
1,Emma,199,0.707597
2,Léon,118,0.796928
3,Homais,104,0.988069
4,Rodolphe,96,0.790957
5,Madame,48,0.813232
6,Lheureux,42,0.991086
7,M. Homais,40,0.922977
8,Justin,36,0.91062
9,Binet,28,0.995274
