# EXTRACTION DE KEYWORDS POUR L'AMOUR ATYPIQUE
### Corpus CAMille - Presse belge francophone (1831-1993)


## Imports

In [1]:
import os
import re 
import yake
import pandas as pd
from tqdm import tqdm
import spacy
from collections import defaultdict


In [2]:
# --- CONFIGURATION ---
data_path = "../../data/txt_tp4"  # Chemin vers vos fichiers .txt
output_csv = "../../data/results/keywords_sorcieres_1951_1993.csv"
output_plot = "../../data/results/evolution_sorcieres_1951_1993.png"

In [3]:
# Initialisation de YAKE (extraction de keywords)
kw_extractor = yake.KeywordExtractor(
    lan="fr",
    n=3,               # Trigrammes (ex: "sorci√®re f√©ministe")
    dedupLim=0.7,      # Seuil moins strict pour garder des variantes
    dedupFunc='seqm',
    windowsSize=1,
    top=100,           # Top 100 candidats (filtr√©s ensuite)
    stopwords=None     # D√©sactiv√© pour ne pas exclure "femme", "magie", etc.
)

In [7]:
def nettoyer_texte(text):
    """Nettoyage adapt√© aux textes historiques (OCR + accents)."""
    text = re.sub(r'[^\w\s\'\-√†√¢√§√©√®√™√´√Æ√Ø√¥√∂√π√ª√º√ß]', ' ', text)  # Garde les accents
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Supprime mots <3 lettres (ex: "xii")
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\b([A-Z]{2,}|hmu|xii|xix|xxe|xxx)\b', '', text, flags=re.IGNORECASE)  # Artefacts OCR
    return text

def filtrer_keywords(keywords, text_nettoye):
    """Filtre les keywords pertinents pour les sorci√®res + f√©minit√©.
    Args:
        keywords: Liste de tuples (mot, score) issus de YAKE.
        text_nettoye: Texte pr√©-nettoy√© pour v√©rification contextuelle.
    Returns:
        Liste filtr√©e et tri√©e des keywords pertinents.
    """
    # --- MOTS-CIBLES ---
mots_cibles = {

        "sorci√®re": ["sorci√®res", "sorcellerie", "sorcier", "sorciers", "magie noire",
                     "envo√ªtement", "envo√ªteuse", "sortil√®ge", "mal√©fice", "pouvoirs occultes",
                     "sabbat", "grimoire", "rituel magique"],

        "f√©minin": ["femme", "femmes", "f√©minit√©", "f√©minisme", "f√©ministes", "MLF",
                    "sage-femme", "gu√©risseuse", "herboriste", "matriarcat",
                    "sororit√©", "√©mancipation"],

        "religion": ["diable", "d√©mon", "satan", "exorcisme", "√©glise", "catholique",
                     "h√©r√©sie", "inquisition"],

        "justice": ["proc√®s", "tribunal", "accusation", "torture", "b√ªcher", "ex√©cution",
                    "ch√¢timent", "victime", "aveu forc√©", "chasse aux sorci√®res"],

        "folklore": ["l√©gende", "mythe", "conte", "tradition populaire", "potion",
                     "incantation", "macrale"],

        "stigmate": ["marginale", "rejet", "exclusion", "tabou", "interdit",
                     "immoral", "d√©viant", "bizarre", "folie", "hyst√©rie"],

        "f√©minisme": ["r√©appropriation", "symbole f√©minin", "pouvoir des femmes",
                       "lib√©ration", "mythologie f√©minine"],

        "m√©dias": ["film", "cin√©ma", "th√©√¢tre", "litt√©rature", "halloween"],

        "exclusions": ["f√©mur", "parfum", "mode", "jeu", "enfant"]
    }

    # --- MOTS √Ä EXCLURE ---
mots_exclus = {
        "film", "cin√©ma", "th√©√¢tre", "costume", "halloween",
        "carnaval", "bd", "dessins anim√©s",
        "rue", "place", "maison", "prix", "notaire",
        "transport", "ville"
    }

    # Pr√©pare une liste plate de toutes les variantes des mots-cibles
    liste_cibles = {mot for liste in mots_cibles.values() for mot in liste}

 keywords_filtres = []
 for kw, score in keywords:
        kw_lower = kw.lower()
        # 1. Exclure les mots non pertinents
        if any(exclu in kw_lower for exclu in mots_exclus):
            continue
        # 2. Prioriser les mots-cibles (score boost√©)
        if any(cible in kw_lower for cible in mots_cibles):
            keywords_filtres.append((kw, float(score) * 1.5))  # Boost
        # 3. Garde les bigrammes/trigrammes neutres (ex: "femme accus√©e")
        elif len(kw.split()) >= 2 and ("femme" in kw_lower or "sorci√®re" in kw_lower):
            keywords_filtres.append((kw, float(score) * 1.0))

    # Tri par score d√©croissant + limite √† 10 r√©sultats max par article
 return sorted(keywords_filtres, key=lambda x: x[1], reverse=True)[:10]

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 59)

In [8]:
# --- MOTS-CIBLES √âTENDUS (STRUCTUR√âS PAR TH√àMES) ---
mots_cibles = {
    # --- 1. SORCI√àRE : TERMES CENTRAUX ---
    "sorci√®re": [
        "sorci√®res", "sorcellerie", "sorcier", "sorciers", "magie noire", "magie blanche",
        "envo√ªtement", "envo√ªteuse", "sortil√®ge", "mal√©fice", "pouvoirs occultes",
        "pacte diabolique", "sabbat", "vol nocturne", "grimoire", "rituel magique",
        "incantation", "potion", "chaudron", "balai", "familier", "d√©monologie",
        "macrale", "tchestia", "djinn", "vaudou", "chamanisme", "n√©cromancie"
    ],
    # --- 2. F√âMININ/F√âMINISME ---
    "f√©minin": [
        "femme", "femmes", "f√©minin", "f√©minit√©", "f√©minisme", "f√©ministes", "MLF",
        "s≈ìur", "m√®re", "grand-m√®re", "vieille femme", "veuve", "c√©libataire",
        "sage-femme", "gu√©risseuse", "herboriste", "accoucheuse", "matriarcat",
        "sororit√©", "√©mancipation", "autonomie f√©minine", "pouvoir des femmes",
        "lib√©ration f√©minine", "witch feminism", "sorci√®re f√©ministe",
        "r√©appropriation", "symbole f√©minin", "d√©esse", "triple d√©esse"
    ],
    # --- 3. RELIGION & DIABOLIQUE ---
    "religion": [
        "diable", "d√©mon", "satan", "lucifer", "belz√©buth", "poss√©d√©", "exorcisme",
        "√©glise", "catholique", "pr√™tre", "cur√©", "confession", "p√©ch√©", "h√©r√©sie",
        "inquisition", "condamnation religieuse", "mal√©diction", "sainte", "vierge",
        "miracle", "superstition", "culte pa√Øen", "sabbath", "messire",
        "excommunication", "rituel satanique", "culte du diable", "pacte avec le malin"
    ],
    # --- 4. JUSTICE & PERS√âCUTION ---
    "justice": [
        "proc√®s", "tribunal", "jugement", "accusation", "torture", "b√ªcher", "ex√©cution",
        "ch√¢timent", "loi", "code p√©nal", "d√©nonciation", "victime", "innocente",
        "culpable", "preuve", "aveu forc√©", "chasse aux sorci√®res", "inquisiteur",
        "bourreau", "autodaf√©", "condamnation", "peine de mort", "tribunal eccl√©siastique",
        "question"  # "question" = torture
    ],
    # --- 5. FOLKLORE & TRADITIONS ---
    "folklore": [
        "l√©gende", "mythe", "conte", "tradition populaire", "croyance", "superstition",
        "potion", "incantation", "macrale", "tchestia", "loup-garou", "f√©e", "lutin",
        "esprit", "revenant", "spectre", "fant√¥me", "chaman", "druide", "alchimie",
        "astrologie", "cartomancie", "chiromancie", "n√©cromancie", "vaudou"
    ],
    # --- 6. STIGMATISATION ---
    "stigmate": [
        "marginale", "rejet", "exclusion", "tabou", "interdit", "immoral", "d√©viant",
        "bizarre", "folie", "hyst√©rie", "d√©moniaque", "poss√©d√©e", "mal√©fique", "maudite",
        "honte", "scandale", "d√©shonneur", "inf√¢me", "monstrueuse", "anormale",
        "dangereuse", "subversive", "r√©prouv√©e", "excommuni√©e", "h√©r√©tique", "soup√ßonn√©e"
    ],
    # --- 7. MOTS √Ä EXCLURE (BRUIT) ---
    "exclusions": [
        "f√©mur", "parfum", "mode", "jeu", "enfant", "jouet", "d√©guisement", "f√™te",
        "carnaval", "maquillage", "chic", "√©l√©gant", "coquet", "recette", "cuisine",
        "m√©t√©orologie", "temp√™te", "orage", "sorcier",  # Exclure "sorcier" si focus sur les femmes
        "film", "cin√©ma", "th√©√¢tre", "costume", "d√©cor", "sc√®ne", "roman", "livre",
        "bd", "dessins anim√©s", "s√©rie", "feuilleton", "fiction", "personnage"
    ]
}

# --- INITIALISATION YAKE (OPTIMIS√â POUR VOTRE CORPUS) ---
kw_extractor = yake.KeywordExtractor(
    lan="fr",
    n=3,               # Trigrammes (ex: "sorci√®re f√©ministe")
    dedupLim=0.7,      # Seuil pour √©viter les doublons
    dedupFunc='seqm',  # M√©thode de d√©duplication
    windowsSize=1,     # Fen√™tre r√©duite pour √©viter le bruit
    top=200,           # Top 200 candidats (filtr√©s ensuite)
    stopwords=None     # D√©sactiv√© pour garder "femme", "magie", etc.
)

# --- FONCTIONS OPTIMIS√âES POUR YAKE SEUL ---

def nettoyer_texte(text):
    """Nettoyage adapt√© aux textes historiques (OCR + accents)."""
    text = re.sub(r'[^\w\s\'\-√†√¢√§√©√®√™√´√Æ√Ø√¥√∂√π√ª√º√ß]', ' ', text)  # Garde les accents
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Supprime mots <3 lettres (ex: "xii")
    text = re.sub(r'\s+', ' ', text).strip()
    # Supprime artefacts OCR et termes parasites
    text = re.sub(
        r'\b([A-Z]{2,}|hmu|xii|xix|xxe|xxx|pp?\.\s?\d+|le\soir|bruxelles|belge|franc)\b',
        '', text, flags=re.IGNORECASE
    )
    return text

def extraire_keywords_yake(text, top_n=10):
    """Extraction de keywords avec YAKE + filtrage par mots-cibles."""
    text_nettoye = nettoyer_texte(text)
    keywords_yake = kw_extractor.extract_keywords(text_nettoye)

    # Filtre et boost des mots-cibles
    keywords_filtres = []
    for kw, score in keywords_yake:
        kw_lower = kw.lower()
        score_final = float(score)

        # Boost des mots pertinents (√ó1.5)
        for categorie, mots in mots_cibles.items():
            if categorie != "exclusions" and any(mot in kw_lower for mot in mots):
                score_final *= 1.5
                break
            # Exclusion des termes bruyants
            elif categorie == "exclusions" and any(mot in kw_lower for mot in mots):
                score_final = 0  # Exclure

        # Garde seulement les keywords pertinents (score > 0, longueur > 3, pas de chiffres)
        if (score_final > 0 and
            len(kw.split()) >= 1 and
            len(kw) > 3 and
            not any(c.isdigit() for c in kw)):
            keywords_filtres.append((kw, score_final))

    # Tri et retour des top_n r√©sultats
    return sorted(keywords_filtres, key=lambda x: x[1], reverse=True)[:top_n]

In [9]:
def analyser_corpus(data_path):
    """Analyse tous les fichiers .txt dans le dossier sp√©cifi√©."""
    results = []
    for fichier in tqdm(os.listdir(data_path), desc="Analyse des fichiers"):
        if fichier.endswith(".txt"):
            with open(os.path.join(data_path, fichier), 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
                # Extraction de la date depuis le nom de fichier (ex: "KB_LeSoir_1975-03-15.txt")
                date_match = re.search(r'(\d{4}-\d{2}-\d{2})', fichier)
                date = date_match.group(1) if date_match else "1900-01-01"  # Date par d√©faut si introuvable

                keywords = extraire_keywords_yake(text)
                if keywords:  # Ne garder que les fichiers avec des keywords pertinents
                    results.append({
                        "fichier": fichier,
                        "date": date,
                        "top_keywords": keywords
                    })
    return results

In [123]:
# --- EX√âCUTION & EXPORT ---
if __name__ == "__main__":
    print("üîç D√©but de l'analyse du corpus...")
    results = analyser_corpus(data_path)

    # Export CSV
    df = pd.DataFrame([
        {
            "fichier": r["fichier"],
            "date": r["date"],
            "keywords": "; ".join([f"{kw[0]} ({kw[1]:.4f})" for kw in r["top_keywords"]])
        }
        for r in results
    ])
    df.to_csv(output_csv, index=False, encoding='utf-8-sig', sep=';')
    print(f" Export CSV termin√© ({len(df)} entr√©es) ‚Üí {output_csv}")

üîç D√©but de l'analyse du corpus...


Analyse des fichiers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 999/999 [1:59:54<00:00,  7.20s/it]    

 Export CSV termin√© (999 entr√©es) ‚Üí ../../data/results/keywords_sorcieres_1951_1993.csv





In [10]:
    # Visualisation (si assez de donn√©es)
    if len(df) > 10:
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df = df.dropna(subset=['date'])
        if len(df) > 1:
            plt.figure(figsize=(12, 6))
            df.set_index('date').resample('5Y').size().plot(
                kind='line',
                marker='o',
                color='#6a0dad',  # Violet (symbolique sorci√®re/f√©minisme)
                linewidth=2,
                title="√âvolution des mentions de sorci√®res dans 'Le Soir' (1951‚Äì1993)",
                xlabel="P√©riode (5 ans)",
                ylabel="Nombre d'articles"
            )
            plt.grid(True, linestyle='--', alpha=0.6)
            plt.tight_layout()
            plt.savefig(output_plot, dpi=300, bbox_inches='tight')
            plt.show()
            print(f" Graphique sauvegard√© ‚Üí {output_plot}")
        else:
            print(" Pas assez de donn√©es pour g√©n√©rer un graphique fiable.")
    else:
        print(" Aucun r√©sultat pertinent trouv√©. V√©rifiez les chemins ou ajustez les mots-cibles.")

NameError: name 'df' is not defined