In [1]:
!wget https://www.gutenberg.org/cache/epub/14155/pg14155.txt -O Madame_Bovary.txt -q

In [11]:
#@title Méthode de découpage basée sur les normes des fichiers issus du Projet Gutenberg

from pathlib import Path
import re, os, tempfile, unicodedata, urllib.request

DOWNLOAD_DIR = Path("/content/romans_Flaubert")
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

FLAUBERT = {
    "Madame_Bovary": "https://www.gutenberg.org/cache/epub/14155/pg14155.txt",
}

START_PATTERNS = [
    re.compile(r'(?i)(?:\*+\s*)?(start|begin)(?:\s+of)?(?:\s+the)?\s+project\s+gutenberg'),
    re.compile(r'(?i)start\s+of\s+this\s+project\s+gutenberg'),
]
END_PATTERNS = [
    re.compile(r'(?i)(?:\*+\s*)?(end|finish|finis?h|stop)(?:\s+of)?(?:\s+the)?\s+project\s+gutenberg'),
    re.compile(r'(?i)end\s+of\s+this\s+project\s+gutenberg'),
]

def normalize(s): return unicodedata.normalize("NFKC", s).replace("\u00A0", " ")

def find_marker(lines, patterns, rev=False):
    rng = range(len(lines)-1, -1, -1) if rev else range(len(lines))
    for i in rng:
        if any(p.search(normalize(lines[i]).strip()) for p in patterns):
            return i
    return None

def gutenberg_strip_text(text):
    text = normalize(text).replace('\r\n', '\n')
    lines = text.splitlines(keepends=True)
    s = find_marker(lines, START_PATTERNS)
    e = find_marker(lines, END_PATTERNS, rev=True)
    if s is None and e is None: return text
    start = (s + 1) if s is not None else 0
    end = e if e is not None else len(lines)
    if start >= end: return text
    body = lines[start:end]
    while body and not body[0].strip(): body.pop(0)
    while body and not body[-1].strip(): body.pop()
    return ''.join(body)

def download_if_missing(url, dest):
    if dest.exists(): return
    try:
        urllib.request.urlretrieve(url, dest)
    except Exception:
        pass  # pas d'affichage d'erreur

def safe_write(path, content):
    with tempfile.NamedTemporaryFile("w", delete=False, encoding="utf-8", dir=path.parent) as tmp:
        tmp.write(content)
    os.replace(tmp.name, path)

def process(folder):
    for p in sorted(folder.glob("*.txt")):
        txt = p.read_text(encoding="utf-8", errors="replace")
        stripped = gutenberg_strip_text(txt)
        if stripped != txt:
            safe_write(p, stripped)

if __name__ == "__main__":
    for title, url in FLAUBERT.items():
        download_if_missing(url, DOWNLOAD_DIR / f"{title}.txt")
    process(DOWNLOAD_DIR)
    print("Découpage terminé. Voir le dossier /content/romans_Flaubert")

Découpage terminé. Voir le dossier /content/romans_Flaubert


In [12]:
#@title Extraction de toutes les entités nommées de Madame Bovary avec Camembert NER

!pip install -q transformers[torch] datasets sentencepiece nltk
!pip install -q --upgrade accelerate

from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import nltk
nltk.download('punkt')
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize
import json, csv, os
from tqdm.auto import tqdm

MODEL_NAME = "Jean-Baptiste/camembert-ner"
TXT_PATH = "/content/Madame_Bovary.txt"
OUTPUT_DIR = "/content/ner_bovary"
os.makedirs(OUTPUT_DIR, exist_ok=True)
DEVICE = 0

# Load model and pipeline
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
try:
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=DEVICE, aggregation_strategy="simple")
except TypeError:
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=DEVICE, grouped_entities=True)

# Read text and split into sentences
with open(TXT_PATH, "r", encoding="utf-8") as f:
    text = f.read()

sentences = sent_tokenize(text, language='french')
print(f"Total sentences: {len(sentences)}")

# Build sentence spans for mapping
sentence_spans = []
cursor = 0
for sent in sentences:
    start = text.find(sent, cursor)
    if start == -1:
        start = cursor
    end = start + len(sent)
    sentence_spans.append((sent, start, end))
    cursor = end

# Build chunks for processing
max_tokens = tokenizer.model_max_length
MARGIN = 32
chunks = []
chunk_starts = []
current_chunk = ""
current_start = None

for sent, s_start, s_end in sentence_spans:
    candidate = f"{current_chunk} {sent}".strip() if current_chunk else sent
    token_len = len(tokenizer(candidate, return_tensors="pt")["input_ids"][0])

    if token_len + MARGIN < max_tokens:
        if not current_chunk:
            current_start = s_start
        current_chunk = candidate
    else:
        if current_chunk:
            chunks.append(current_chunk)
            chunk_starts.append(current_start)
        current_chunk = sent
        current_start = s_start

if current_chunk:
    chunks.append(current_chunk)
    chunk_starts.append(current_start)

print(f"Total chunks: {len(chunks)}")

# Find sentence containing entity
def find_sentence(abs_start, abs_end):
    for s_text, s_start, s_end in sentence_spans:
        if s_start <= abs_start and abs_end <= s_end:
            return s_text.strip()
    return text[max(0, abs_start-200):min(len(text), abs_end+200)]

# Run NER and collect entities
all_entities = []
for chunk_text, chunk_start in tqdm(list(zip(chunks, chunk_starts)), desc="NER chunks"):
    results = nlp(chunk_text)
    for ent in results:
        start = ent.get("start")
        end = ent.get("end")
        if start is None or end is None:
            continue

        abs_start = chunk_start + start
        abs_end = chunk_start + end
        ent_type = ent.get("entity_group") or ent.get("entity") or ent.get("label")
        ent_text = ent.get("word", text[abs_start:abs_end])
        sentence = find_sentence(abs_start, abs_end)

        all_entities.append({
            "text": ent_text,
            "type": ent_type,
            "sentence": sentence,
            "key": (abs_start, abs_end, ent_type, ent_text)
        })

print(f"Raw entities: {len(all_entities)}")

# Deduplicate by keeping first occurrence of each unique entity
seen = set()
entities_dedup = []
for e in all_entities:
    if e["key"] not in seen:
        seen.add(e["key"])
        entities_dedup.append({"text": e["text"], "type": e["type"], "sentence": e["sentence"]})

print(f"Entities after dedup: {len(entities_dedup)}")

# Save entities
json_path = os.path.join(OUTPUT_DIR, "madame_bovary_ner.json")
csv_path = os.path.join(OUTPUT_DIR, "madame_bovary_ner.csv")

with open(json_path, "w", encoding="utf-8") as f:
    json.dump(entities_dedup, f, ensure_ascii=False, indent=2)

with open(csv_path, "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["text", "type", "sentence"])
    writer.writeheader()
    writer.writerows(entities_dedup)

print(f"Saved {len(entities_dedup)} entities to:")
print(f"  - {json_path}")
print(f"  - {csv_path}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


Total sentences: 6801
Total chunks: 383


NER chunks:   0%|          | 0/383 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Raw entities: 3109
Entities after dedup: 3109
Saved 3109 entities to:
  - /content/ner_bovary/madame_bovary_ner.json
  - /content/ner_bovary/madame_bovary_ner.csv


In [13]:
import json
from collections import Counter

# Charger les résultats de l'analyse NER
with open("/content/ner_bovary/madame_bovary_ner.json", "r", encoding="utf-8") as f:
    entities = json.load(f)
per_entities = [e["text"] for e in entities if e["type"] == "PER"]

entity_counts = Counter(per_entities)

# Trier par ordre décroissant de fréquence
sorted_entities = entity_counts.most_common()

print(sorted_entities)

[('Charles', 302), ('Emma', 199), ('Léon', 115), ('Homais', 103), ('Rodolphe', 95), ('Madame', 43), ('Lheureux', 42), ('M. Homais', 38), ('Justin', 36), ('Binet', 28), ('Félicité', 26), ('M. Lheureux', 24), ('Hippolyte', 24), ('Monsieur', 21), ('madame Homais', 18), ('Rouault', 16), ('Hivert', 15), ('Berthe', 15), ('Lefrançois', 14), ('madame Lefrançois', 14), ('père Rouault', 13), ('Tuvache', 13), ('Canivet', 13), ('or', 11), ('Rodol', 11), ('Vinçart', 11), ('Lestiboudois', 10), ('Conseiller', 10), ('Lion d', 9), ('Rolet', 9), ('M. Guillaumin', 9), ('M. Bournisien', 9), ('M. Canivet', 9), ('Bournisien', 9), ('Vicomte', 8), ('M. Léon', 8), ('Tostes', 7), ('M. Binet', 7), ('Voltaire', 7), ('Napoléon', 7), ('Athalie', 7), ('M. Bovary', 7), ('M. Boulanger', 7), ('madame Tuvache', 7), ('Bo', 7), ('Bridoux', 7), ('Aveugle', 7), ('Bertaux', 6), ('M.', 6), ('Artémise', 6), ('M. Tuvache', 6), ('Bovary', 6), ('Nastasie', 5), ('l', 5), ('Hirondelle', 5), ('Dieu', 5), ('madame', 5), ('L', 5), ('T

In [14]:
#@title Toutes les occurences de l'entité nommée "Emma" : répartitions des types (PER, LOC, MISC, etc.)

import json
import csv
import os

# Input paths
INPUT_DIR = "/content/ner_bovary"
JSON_PATH = os.path.join(INPUT_DIR, "madame_bovary_ner.json")
CSV_PATH = os.path.join(INPUT_DIR, "madame_bovary_ner.csv")

# Output paths
OUTPUT_DIR = "/content/ner_bovary"
EMMA_PER_JSON = os.path.join(OUTPUT_DIR, "emma_per_only.json")
EMMA_PER_CSV = os.path.join(OUTPUT_DIR, "emma_per_only.csv")
EMMA_ALL_JSON = os.path.join(OUTPUT_DIR, "emma_all_types.json")
EMMA_ALL_CSV = os.path.join(OUTPUT_DIR, "emma_all_types.csv")

# Load the data (try JSON first, fall back to CSV)
entities = []
try:
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        entities = json.load(f)
    print(f"Loaded {len(entities)} entities from JSON")
except FileNotFoundError:
    print("JSON file not found, trying CSV...")
    with open(CSV_PATH, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        entities = list(reader)
    print(f"Loaded {len(entities)} entities from CSV")

# Filter 1: Only sentences containing "Emma" AND type is "PER"
emma_per_entities = [
    e for e in entities
    if "Emma" in e.get("sentence", "") and e.get("type") == "PER"
]

# Filter 2: All sentences containing "Emma" (any type)
emma_all_entities = [
    e for e in entities
    if "Emma" in e.get("sentence", "")
]

print(f"\nFilter results:")
print(f"  - Emma with type PER: {len(emma_per_entities)} entities")
print(f"  - Emma all types: {len(emma_all_entities)} entities")

# Check for non-PER types
non_per_types = set(e.get("type") for e in emma_all_entities if e.get("type") != "PER")
if non_per_types:
    print(f"  - Non-PER types found: {', '.join(sorted(non_per_types))}")

# Save Filter 1: Emma PER only
with open(EMMA_PER_JSON, "w", encoding="utf-8") as f:
    json.dump(emma_per_entities, f, ensure_ascii=False, indent=2)

with open(EMMA_PER_CSV, "w", encoding="utf-8", newline="") as f:
    if emma_per_entities:
        writer = csv.DictWriter(f, fieldnames=["text", "type", "sentence"])
        writer.writeheader()
        writer.writerows(emma_per_entities)

# Save Filter 2: Emma all types
with open(EMMA_ALL_JSON, "w", encoding="utf-8") as f:
    json.dump(emma_all_entities, f, ensure_ascii=False, indent=2)

with open(EMMA_ALL_CSV, "w", encoding="utf-8", newline="") as f:
    if emma_all_entities:
        writer = csv.DictWriter(f, fieldnames=["text", "type", "sentence"])
        writer.writeheader()
        writer.writerows(emma_all_entities)

Loaded 3109 entities from JSON

Filter results:
  - Emma with type PER: 331 entities
  - Emma all types: 562 entities
  - Non-PER types found: LOC, MISC


In [8]:
pip install stanza -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m1.4/1.7 MB[0m [31m46.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
import stanza
import json
import csv
import os
from collections import defaultdict

# Setup Stanza
print("Setting up Stanza for French...")
stanza.download('fr', verbose=False)
nlp = stanza.Pipeline('fr', verbose=False)

# Input paths
INPUT_DIR = "/content/ner_bovary"
EMMA_ALL_JSON = os.path.join(INPUT_DIR, "emma_all_types.json")
EMMA_ALL_CSV = os.path.join(INPUT_DIR, "emma_all_types.csv")

# Load Emma sentences (try JSON first, fall back to CSV)
emma_entities = []
try:
    with open(EMMA_ALL_JSON, "r", encoding="utf-8") as f:
        emma_entities = json.load(f)
    print(f"Loaded {len(emma_entities)} Emma entities from JSON")
except FileNotFoundError:
    print("JSON file not found, trying CSV...")
    with open(EMMA_ALL_CSV, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        emma_entities = list(reader)
    print(f"Loaded {len(emma_entities)} Emma entities from CSV")

# Extract unique sentences containing Emma
emma_sentences = list(set(e.get("sentence", "") for e in emma_entities if e.get("sentence")))
print(f"Found {len(emma_sentences)} unique sentences containing Emma")

# Process sentences with Stanza and extract adjectives
print("\nProcessing sentences with Stanza to extract adjectives...")
sentence_adjectives = []

for i, sentence in enumerate(emma_sentences, 1):
    if i % 50 == 0:
        print(f"  Processing sentence {i}/{len(emma_sentences)}...")

    doc = nlp(sentence)

    # Extract adjectives (POS tag = ADJ)
    adjectives = []
    for sent in doc.sentences:
        for word in sent.words:
            if word.upos == "ADJ":
                adjectives.append(word.text)

    if adjectives:
        sentence_adjectives.append({
            "sentence": sentence,
            "adjectives": adjectives,
            "adjective_count": len(adjectives)
        })

print(f"\nResults:")
print(f"  - Sentences with adjectives: {len(sentence_adjectives)}")
print(f"  - Total adjectives found: {sum(s['adjective_count'] for s in sentence_adjectives)}")

# Display sample results
print("\nSample sentences with adjectives:")
for i, item in enumerate(sentence_adjectives[:5], 1):
    print(f"\n{i}. Adjectives: {', '.join(item['adjectives'])}")
    print(f"   Sentence: {item['sentence'][:100]}{'...' if len(item['sentence']) > 100 else ''}")

# Save results
OUTPUT_JSON = os.path.join(INPUT_DIR, "emma_sentences_adjectives.json")
OUTPUT_CSV = os.path.join(INPUT_DIR, "emma_sentences_adjectives.csv")

with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(sentence_adjectives, f, ensure_ascii=False, indent=2)

# For CSV, join adjectives as comma-separated string
with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["sentence", "adjectives", "adjective_count"])
    writer.writeheader()
    for item in sentence_adjectives:
        writer.writerow({
            "sentence": item["sentence"],
            "adjectives": ", ".join(item["adjectives"]),
            "adjective_count": item["adjective_count"]
        })

print(f"\nSaved results to:")
print(f"  - {OUTPUT_JSON}")
print(f"  - {OUTPUT_CSV}")

Setting up Stanza for French...


KeyboardInterrupt: 

In [1]:
!python3 -m spacy download fr_dep_news_trf

Collecting fr-dep-news-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_dep_news_trf-3.8.0/fr_dep_news_trf-3.8.0-py3-none-any.whl (397.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.7/397.7 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-curated-transformers<1.0.0,>=0.2.2 (from fr-dep-news-trf==3.8.0)
  Downloading spacy_curated_transformers-0.3.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting protobuf<3.21.0 (from fr-dep-news-trf==3.8.0)
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<1.0.0,>=0.2.2->fr-dep-news-trf==3.8.0)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<1.0.0,>=0.2.2->fr-dep-news-trf==3.8.0)
  Downloading curated_tokenizers-0.0.9-cp312-cp312-manylinux_2_17_x86_64.manyli

Flaubert fait des phrases complexes avec des cascades de subordonnées relatives, c'est difficile de les décortiquer automatiquement et de savoir quel adjectif se rapporte à quel sujet

In [9]:
import spacy

# Choisis ton modèle : "fr_dep_news_trf" (meilleur si dispo) ou "fr_core_news_md"
nlp = spacy.load("fr_dep_news_trf")

def expand_with_conjuncts(token):
    """Retourne token + ses conjoints (ex. 'Pierre et Paul' -> Pierre, Paul)."""
    results = {token}
    head = token.head
    # les tokens reliés par dep_ 'conj' qui partagent le même head (ou qui ont token comme head)
    for child in head.children:
        if child.dep_ == "conj":
            # cas où target est conj ou où le conjoncteur partage le head
            results.add(child)
    # cas où token lui-même est marqué 'conj' : rajoute son head si c'est un nom
    if token.dep_ == "conj" and token.head.pos_ in {"NOUN", "PROPN", "PRON"}:
        results.add(token.head)
    return list(results)

def find_noun_chunk_for(token, doc):
    """Si le modèle fournit noun_chunks, on essaye de récupérer le syntagme nominal
       contenant le token (plus lisible qu'un token isolé)."""
    if not hasattr(doc, "noun_chunks"):
        return None
    for nc in doc.noun_chunks:
        if token.i >= nc.start and token.i < nc.end:
            return nc
    return None

def subjects_for_predicative_adj(adj):
    """
    Trouve le(s) sujet(s) pertinent(s) si l'adjectif est prédicatif (attaché à un verbe).
    Cherche:
      - nsubj enfants du verbe head
      - si pas trouvé, cherche un ancêtre nominal
      - heuristique: recherche de noms proches à gauche
    """
    verb = adj.head
    subs = [c for c in verb.children if c.dep_.startswith("nsubj")]  # 'nsubj', 'nsubj:pass', ...
    if subs:
        return subs
    # si verbe est copule (ex. 'est') mais pas de nsubj trouvé, regarder ancestors
    for anc in adj.ancestors:
        if anc.pos_ in {"NOUN", "PROPN", "PRON"}:
            return [anc]
    # heuristique : chercher un nom parmi les tokens précédents dans une petite fenêtre
    left_window = [t for t in adj.doc[max(0, adj.i-4):adj.i] if t.pos_ in {"NOUN","PROPN","PRON"}]
    if left_window:
        return [left_window[-1]]
    return []

def adjective_to_subjects(doc):
    """
    Parcourt les adjectifs et retourne pour chacun:
      - l'adjectif,
      - sa relation ('attributif' ou 'prédicatif' ou 'inconnu'),
      - la/les cibles (sujet/nom) sous forme de spans lisibles.
    """
    results = []
    for token in doc:
        if token.pos_ != "ADJ":
            continue

        # 1) cas attributif direct : adjectif modifie un nom (ex. "chat blanc")
        if token.dep_ == "amod" and token.head.pos_ in {"NOUN", "PROPN", "PRON"}:
            targets = expand_with_conjuncts(token.head)
            relation = "attributif (amod)"
            results.append((token, relation, targets))
            continue

        # 1b) cas adjectif conjoncté à un autre adjectif
        if token.dep_ == "conj" and token.head.pos_ == "ADJ":
            # on récupère la cible de l'adj chef si possible
            head_adj = token.head
            # si le chef modifie un nom directement -> même cible
            if head_adj.dep_ == "amod" and head_adj.head.pos_ in {"NOUN","PROPN","PRON"}:
                targets = expand_with_conjuncts(head_adj.head)
                relation = "attributif (via conjonction)"
                results.append((token, relation, targets))
                continue
            # si le chef est prédicatif (attaché à un verbe)
            if head_adj.head.pos_ == "VERB":
                subs = subjects_for_predicative_adj(head_adj)
                if subs:
                    relation = "prédicatif (via conjonction)"
                    results.append((token, relation, subs))
                    continue

        # 2) cas prédicatif : l'adjectif est lié à un verbe (ex. 'sont heureux')
        if token.head.pos_ == "VERB" or token.dep_ in {"acomp", "attr"}:
            subs = subjects_for_predicative_adj(token)
            if subs:
                relation = "prédicatif (lié au verbe '{}')".format(token.head.lemma_)
                results.append((token, relation, subs))
                continue

        # 3) heuristique : chercher un ancêtre nominal
        noun_anc = None
        for anc in token.ancestors:
            if anc.pos_ in {"NOUN", "PROPN", "PRON"}:
                noun_anc = anc
                break
        if noun_anc:
            targets = expand_with_conjuncts(noun_anc)
            relation = "probablement attributif (via ancêtre nominal)"
            results.append((token, relation, targets))
            continue

        # 4) dernier recours: chercher un nom proche à gauche
        left_window = [t for t in doc[max(0, token.i-4):token.i] if t.pos_ in {"NOUN","PROPN","PRON"}]
        if left_window:
            targets = expand_with_conjuncts(left_window[-1])
            relation = "heuristique (nom proche à gauche)"
            results.append((token, relation, targets))
            continue

        # 5) rien trouvé
        results.append((token, "inconnu", []))
    return results

if __name__ == "__main__":
    texte = "Malgré les explications d’Emma, dès le duo récitatif où Gilbert expose à son maître Ashton ses abominables manœuvres, Charles, en voyant le faux anneau de fiançailles qui doit abuser Lucie, crut que c’était un souvenir d’amour envoyé par Edgar."
    doc = nlp(texte)

    infos = adjective_to_subjects(doc)
    for adj, relation, targets in infos:
        # essayer d'obtenir un syntagme nominal si possible
        target_texts = []
        for t in targets:
            nc = find_noun_chunk_for(t, doc)
            if nc:
                target_texts.append(f"'{nc.text}' (tokens {nc.start}-{nc.end-1})")
            else:
                target_texts.append(f"'{t.text}' (index {t.i})")
        if not target_texts:
            target_texts = ["<aucun sujet trouvé>"]

        print(f"Adjectif: '{adj.text}' (index {adj.i}, dep={adj.dep_})")
        print(f"  → Rôle détecté : {relation}")
        print(f"  → Rapporté à : {', '.join(target_texts)}")
        print()

Adjectif: 'récitatif' (index 9, dep=amod)
  → Rôle détecté : attributif (amod)
  → Rapporté à : 'le duo récitatif' (tokens 7-9)

Adjectif: 'abominables' (index 18, dep=amod)
  → Rôle détecté : attributif (amod)
  → Rapporté à : 'ses abominables manœuvres' (tokens 17-19)

Adjectif: 'faux' (index 26, dep=amod)
  → Rôle détecté : attributif (amod)
  → Rapporté à : 'le faux anneau' (tokens 25-27)



In [10]:
#@title Détecter à quel sujet chaque adjectif se rapporte - cas limite

texte = "Puis, s'adressant à Emma, qui portait une robe de soie bleue à quatre falbalas: — Je vous trouve jolie comme un Amour!"
doc = nlp(texte)

infos = adjective_to_subjects(doc)
for adj, relation, targets in infos:
    target_texts = [f"'{find_noun_chunk_for(t, doc).text}'" if find_noun_chunk_for(t, doc) else f"'{t.text}'"
                    for t in targets] or ["<aucun sujet trouvé>"]

    print(f"Adjectif: '{adj.text}' (dep={adj.dep_})")
    print(f"  → Rôle: {relation}")
    print(f"  → Rapporté à: {', '.join(target_texts)}\n")

Adjectif: 'bleue' (dep=amod)
  → Rôle: attributif (amod)
  → Rapporté à: 'une robe'

Adjectif: 'jolie' (dep=xcomp)
  → Rôle: prédicatif (lié au verbe 'trouver')
  → Rapporté à: 'Je'



In [16]:
import os
import json
import csv
from typing import List
import spacy

# ------------------------------
# Your functions (kept as-is, with a small addition for appos)
# ------------------------------
def expand_with_conjuncts(token):
    results = {token}
    head = token.head
    # les tokens reliés par dep_ 'conj' qui partagent le même head (ou qui ont token comme head)
    for child in head.children:
        if child.dep_ == "conj":
            # cas où target est conj ou où le conjoncteur partage le head
            results.add(child)
    # cas où token lui-même est marqué 'conj' : rajoute son head si c'est un nom
    if token.dep_ == "conj" and token.head.pos_ in {"NOUN", "PROPN", "PRON"}:
        results.add(token.head)
    return list(results)


def find_noun_chunk_for(token, doc):
    """Si le modèle fournit noun_chunks, on essaye de récupérer le syntagme nominal
       contenant le token (plus lisible qu'un token isolé)."""
    if not hasattr(doc, "noun_chunks"):
        return None
    for nc in doc.noun_chunks:
        if token.i >= nc.start and token.i < nc.end:
            return nc
    return None


def subjects_for_predicative_adj(adj):
    """
    Find the relevant subject(s) if the adjective is predicative (attached to a verb).
    Cherche:
      - nsubj enfants du verbe head
      - si pas trouvé, cherche un ancêtre nominal
      - heuristique: recherche de noms proches à gauche
    """
    verb = adj.head
    subs = [c for c in verb.children if c.dep_.startswith("nsubj")]  # 'nsubj', 'nsubj:pass', ...
    if subs:
        return subs
    # si verbe est copule (ex. 'est') mais pas de nsubj trouvé, regarder ancestors
    for anc in adj.ancestors:
        if anc.pos_ in {"NOUN", "PROPN", "PRON"}:
            return [anc]
    # heuristique : chercher un nom parmi les tokens précédents dans une petite fenêtre
    left_window = [t for t in adj.doc[max(0, adj.i-4):adj.i] if t.pos_ in {"NOUN","PROPN","PRON"}]
    if left_window:
        return [left_window[-1]]
    return []


def adjective_to_subjects(doc):
    """
    Parcourt les adjectifs et retourne pour chacun:
      - l'adjectif,
      - sa relation ('attributif' ou 'prédicatif' ou 'inconnu'),
      - la/les cibles (sujet/nom) sous forme de tokens.
    """
    results = []
    for token in doc:
        if token.pos_ != "ADJ":
            continue

        # 1) cas attributif direct : adjectif modifie un nom (ex. "chat blanc")
        if token.dep_ == "amod" and token.head.pos_ in {"NOUN", "PROPN", "PRON"}:
            targets = expand_with_conjuncts(token.head)
            relation = "attributif (amod)"
            results.append((token, relation, targets))
            continue

        # 1b) cas adjectif conjoncté à un autre adjectif
        if token.dep_ == "conj" and token.head.pos_ == "ADJ":
            # on récupère la cible de l'adj chef si possible
            head_adj = token.head
            # si le chef modifie un nom directement -> même cible
            if head_adj.dep_ == "amod" and head_adj.head.pos_ in {"NOUN","PROPN","PRON"}:
                targets = expand_with_conjuncts(head_adj.head)
                relation = "attributif (via conjonction)"
                results.append((token, relation, targets))
                continue
            # si le chef est prédicatif (attaché à un verbe)
            if head_adj.head.pos_ == "VERB":
                subs = subjects_for_predicative_adj(head_adj)
                if subs:
                    relation = "prédicatif (via conjonction)"
                    results.append((token, relation, subs))
                    continue

        # 1c) cas appositif (ex. "Emma, triste, ...")
        if token.dep_ == "appos" and token.head.pos_ in {"NOUN", "PROPN", "PRON"}:
            targets = expand_with_conjuncts(token.head)
            relation = "attributif (appos)"
            results.append((token, relation, targets))
            continue

        # 2) cas prédicatif : l'adjectif est lié à un verbe (ex. 'sont heureux')
        if token.head.pos_ == "VERB" or token.dep_ in {"acomp", "attr"}:
            subs = subjects_for_predicative_adj(token)
            if subs:
                relation = "prédicatif (lié au verbe '{}')".format(token.head.lemma_)
                results.append((token, relation, subs))
                continue

        # 3) heuristique : chercher un ancêtre nominal
        noun_anc = None
        for anc in token.ancestors:
            if anc.pos_ in {"NOUN", "PROPN", "PRON"}:
                noun_anc = anc
                break
        if noun_anc:
            targets = expand_with_conjuncts(noun_anc)
            relation = "probablement attributif (via ancêtre nominal)"
            results.append((token, relation, targets))
            continue

        # 4) dernier recours: chercher un nom proche à gauche
        left_window = [t for t in doc[max(0, token.i-4):token.i] if t.pos_ in {"NOUN","PROPN","PRON"}]
        if left_window:
            targets = expand_with_conjuncts(left_window[-1])
            relation = "heuristique (nom proche à gauche)"
            results.append((token, relation, targets))
            continue

        # 5) rien trouvé
        results.append((token, "inconnu", []))
    return results


# ------------------------------
# Helpers to decide if a target refers to Emma (or 'elle')
# ------------------------------
EMMA_FORMS = {"emma"}
SHE_FORMS = {"elle"}  # we accept 'elle' as referring to Emma per your assumption

def is_she_pronoun(token):
    # Strictly accept the form "elle" (case-insensitive). If needed, loosen with morph features.
    if token.pos_ != "PRON":
        return False
    txt = token.text.lower()
    if txt in SHE_FORMS or txt.startswith("elle-") or txt.replace("’", "'").startswith("elle'"):
        return True
    # Morphological fallback: feminine, singular, 3rd person
    g = token.morph.get("Gender")
    n = token.morph.get("Number")
    p = token.morph.get("Person")
    return ("Fem" in g) and ("Sing" in n) and ("3" in "".join(p))

def is_token_emma(token):
    return token.text.lower() in EMMA_FORMS or token.lemma_.lower() in EMMA_FORMS

def in_same_appellation_as_emma(target):
    """
    Heuristic for cases like 'Mademoiselle Emma' where the head might be 'Mademoiselle'
    and 'Emma' is attached via 'flat', 'appos', etc. Avoids counting 'mère d’Emma'.
    """
    # Direct name
    if is_token_emma(target):
        return True

    # Check immediate 'name-like' relations
    name_like_labels = ("flat", "flat:name", "appos", "compound", "name")
    for child in target.children:
        if is_token_emma(child) and (child.dep_ in name_like_labels or child.dep_.startswith("flat")):
            return True
    if target.dep_ in name_like_labels or target.dep_.startswith("flat"):
        if is_token_emma(target.head):
            return True

    # Quick adjacency heuristic: token immediately followed by 'Emma' (e.g., "Mademoiselle Emma")
    try:
        next_tok = target.nbor(1)
        if is_token_emma(next_tok):
            return True
    except Exception:
        pass

    return False

def target_refers_to_emma(target, doc):
    """Return True if the target token is Emma or 'elle', or if the target is in a proper-name span with 'Emma'."""
    # 'Emma' directly
    if is_token_emma(target):
        return True

    # 'Elle' as subject pronoun (assumed to be Emma)
    if is_she_pronoun(target):
        return True

    # If target is a NOUN/PROPN in the same 'name' span as Emma (e.g., 'Mademoiselle Emma')
    if in_same_appellation_as_emma(target):
        return True

    # If within a noun chunk that has Emma as part of the name (not as genitive 'de Emma')
    nc = find_noun_chunk_for(target, doc)
    if nc:
        root = nc.root
        for t in nc:
            if is_token_emma(t):
                # Accept only if Emma functions as the head or as 'flat/appos' inside the same name
                # Reject genitives like 'mère d’Emma'
                if t == root or t.dep_.startswith("flat") or t.dep_ in {"appos", "compound", "name"}:
                    return True

    return False


# ------------------------------
# IO helpers
# ------------------------------
def read_sentences(json_path: str, csv_path: str) -> List[str]:
    sentences = []

    # Prefer JSON if it exists (the 'sentence' field), otherwise fallback to CSV 3rd column
    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        for item in data:
            if isinstance(item, dict) and "sentence" in item:
                sent = item["sentence"].strip()
                if sent:
                    sentences.append(sent)

    if not sentences and os.path.exists(csv_path):
        with open(csv_path, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) >= 3:
                    sent = row[2].strip()
                    if sent:
                        sentences.append(sent)

    # deduplicate while preserving order
    seen = set()
    unique_sentences = []
    for s in sentences:
        if s not in seen:
            unique_sentences.append(s)
            seen.add(s)
    return unique_sentences


def write_results(rows, out_path: str):
    # rows: list of dicts with keys 'sentence', 'adjective'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["sentence", "adjective"])
        writer.writeheader()
        writer.writerows(rows)


# ------------------------------
# Main extraction
# ------------------------------
def extract_emma_adjectives(
    json_path="/content/ner_bovary/emma.json",
    csv_path="/content/ner_bovary/emma.csv",
    out_path="/content/ner_bovary/emma_adjectives.csv",
    batch_size=32
):
    sentences = read_sentences(json_path, csv_path)
    if not sentences:
        raise FileNotFoundError("No sentences found. Check paths to JSON/CSV.")

    # Load spaCy French transformer model (NER not needed here)
    nlp = spacy.load("fr_dep_news_trf", disable=["ner"])

    results = []
    for doc in nlp.pipe(sentences, batch_size=batch_size):
        # For each adjective in the doc, find its targets and keep only those that refer to Emma
        triples = adjective_to_subjects(doc)
        for adj_token, relation, targets in triples:
            # Keep if any of the targets is Emma or 'elle'
            if any(target_refers_to_emma(t, doc) for t in targets):
                results.append({
                    "sentence": doc.text,
                    "adjective": adj_token.text
                })

    write_results(results, out_path)
    print(f"Processed {len(sentences)} passages; found {len(results)} Emma-linked adjectives.")
    print(f"Saved to {out_path}")


if __name__ == "__main__":
    extract_emma_adjectives()

Processed 428 passages; found 55 Emma-linked adjectives.
Saved to /content/ner_bovary/emma_adjectives.csv


In [21]:
import polars as pl

df = pl.read_csv("/content/ner_bovary/emma_adjectives.csv")
col = df.select(df.columns[1])
for val in col.to_series():
    print(val)

pleine
bonne
toute
silencieuse
silencieuse
disposée
difficile
capricieuse
première
première
tels
faible
longue
amoureuse
toute
seule
laide
joyeuse
silencieuse
silencieuse
silencieuse
pareilles
pareilles
prête
courte
courte
pauvre
anxieuse
fraîche
embarrass
toutes
docile
docile
agonisante
tous
invincible
pâle
pâle
blanche
seule
ivre
ivre
telle
enflammée
avide
folle
bonne
agitée
faible
faible
mignonne
seule
pâle
désespérée
petite
