# Word Embeddings : le modèle Word2Vec

## Imports

In [1]:
# --- IMPORTS ---
import os
import re
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import gensim
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

# --- PATHS ---
# Both paths are relative to the notebook's working directory.
corpus_path = "../../data/sents.txt"      # Full CAMille corpus
articles_dir = "../../data/txt_tp4/"      # Your filtered TP4 articles

# --- FONCTIONS UTILITAIRES ---
def tokenize(text):
    """Lower-case, strip punctuation and split *text* into tokens.

    Steps, in order:
      1. lower-case the input;
      2. replace every character that is neither a word character nor
         whitespace with a single space;
      3. collapse runs of whitespace into one space;
      4. split with NLTK's ``word_tokenize``;
      5. keep only tokens longer than two characters.

    Returns the list of surviving tokens.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r"[^\w\s]", " ", lowered)
    normalized = re.sub(r"\s+", " ", without_punctuation)
    # Very short tokens (one or two characters) are discarded.
    return [tok for tok in word_tokenize(normalized) if len(tok) > 2]

# --- DATA LOADING ---
# Full corpus: every non-empty line of the file becomes one document.
# The file is streamed line by line so the raw lines and the cleaned list
# are never both held in memory at the same time.
with open(corpus_path, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    all_texts = [line for line in stripped_lines if line]
print(f"Nombre de documents CAMille : {len(all_texts)}")

# TP4 articles: one document per ``.txt`` file, read in sorted filename order.
files = sorted([f for f in os.listdir(articles_dir) if f.endswith(".txt")])
tp4_texts = []
for filename in files:
    # Open each article in a ``with`` block so its handle is closed right
    # away instead of being left for the garbage collector.
    with open(os.path.join(articles_dir, filename), "r", encoding="utf-8") as article:
        tp4_texts.append(article.read())
print(f"Nombre d'articles TP4 : {len(tp4_texts)}")

# --- TOKENIZATION ---
# Apply `tokenize` to every document; each corpus becomes a list of token lists.
tokenized_all = list(map(tokenize, all_texts))
tokenized_tp4 = list(map(tokenize, tp4_texts))

# --- TRAIN THE GLOBAL Word2Vec MODEL (entire CAMille corpus) ---
# Hyper-parameters are gathered in one mapping and unpacked into the constructor.
global_w2v_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 3,
    "workers": 4,
    "sg": 1,  # skip-gram
}
model_global = Word2Vec(sentences=tokenized_all, **global_w2v_params)
# Persist the trained model in the current working directory.
model_global.save("w2v_global.model")
print("Modèle Word2Vec global entraîné.")

# --- OPTIONAL: Word2Vec MODEL TRAINED ON THE TP4 ARTICLES ONLY ---
# Hyper-parameters are gathered in one mapping and unpacked into the constructor.
tp4_w2v_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 2,
    "workers": 4,
    "sg": 1,
}
model_tp4 = Word2Vec(sentences=tokenized_tp4, **tp4_w2v_params)
# Persist the trained model in the current working directory.
model_tp4.save("w2v_tp4.model")
print("Modèle Word2Vec TP4 entraîné.")

# --- EXPLORATION / SEMANTIC NEIGHBOURS ---
# Query words used below for the neighbour listing and for the vector export.
keywords = ["sorcière", "sorcellerie", "femme", "féminisme", "procès", "homme", "justice"]

def show_neighbors(word, model, topn=10):
    """Print the *topn* most similar words to *word* according to *model*.

    Args:
        word: Query word.
        model: Object exposing a ``wv`` attribute that supports ``in`` and
            ``most_similar(word, topn=...)``.
        topn: Number of neighbours requested from ``most_similar``.

    Prints a notice and nothing else when *word* is not in ``model.wv``.
    Returns ``None``.
    """
    vectors = model.wv
    if word in vectors:
        print(f"--- Voisins de '{word}' ---")
        nearest = vectors.most_similar(word, topn=topn)
        for neighbour, similarity in nearest:
            # Left-align the word in a 20-character column, score to 3 decimals.
            print(f"{neighbour:20s}  {similarity:.3f}")
    else:
        print(f"'{word}' absent du vocabulaire.")

# List the nearest neighbours of every keyword, using the global model only.
print("\nVoisins sémantiques - Modèle global")
for kw in keywords:
    show_neighbors(kw, model_global)

# --- COMPARAISON ENTRE MOTS ---
def compare_similarity(word1, word2, models_dict):
    """Print the similarity between two words for each model in *models_dict*.

    Args:
        word1: First word of the pair.
        word2: Second word of the pair.
        models_dict: Mapping of display name -> model. Each model must expose
            a ``wv`` attribute supporting ``in`` and ``similarity(a, b)``.

    For a model missing either word, ``"<name>: absent"`` is printed instead
    of a score. Returns ``None``.
    """
    print(f"\n### Similarité: {word1} ↔ {word2}")
    for label, candidate in models_dict.items():
        vectors = candidate.wv
        if word1 not in vectors or word2 not in vectors:
            print(f"{label}: absent")
            continue
        score = vectors.similarity(word1, word2)
        print(f"{label}: {score:.3f}")

# Run the same comparison for each word pair against both trained models.
comparison_models = {"Global": model_global, "TP4": model_tp4}
word_pairs = (
    ("sorcière", "femme"),
    ("sorcière", "procès"),
    ("femme", "homme"),
)
for first_word, second_word in word_pairs:
    compare_similarity(first_word, second_word, comparison_models)

# --- EXPORT VECTORS FOR FURTHER ANALYSIS ---
EXPORT_DIR = "w2v_exports"
os.makedirs(EXPORT_DIR, exist_ok=True)
export_path = os.path.join(EXPORT_DIR, "embedding_vectors.json")

# Models to export from, keyed by the label written to the "model" field.
models_to_export = {"Global": model_global, "TP4": model_tp4}

# One record per (keyword, model) pair; pairs where the keyword is missing
# from the model's vocabulary are skipped.
embeddings_export = []
for word in keywords:
    for model_name, model in models_to_export.items():
        if word in model.wv:
            record = {
                "model": model_name,
                "word": word,
                # Convert the vector to a plain list before JSON serialisation.
                "vector": model.wv[word].tolist(),
            }
            embeddings_export.append(record)

with open(export_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps accented keywords readable in the output file.
    json.dump(embeddings_export, f, indent=2, ensure_ascii=False)

print("Vecteurs embeddings exportés.")


[nltk_data] Downloading package punkt to /Users/ilaria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Nombre de documents CAMille : 1000000
Nombre d'articles TP4 : 999


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

Modèle Word2Vec global entraîné.


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

Modèle Word2Vec TP4 entraîné.

Voisins sémantiques - Modèle global
--- Voisins de 'sorcière' ---
montagnarde           0.957
bégayait              0.954
rictus                0.953
convulsion            0.953
reportait             0.952
soubrette             0.950
pauvresse             0.948
duègne                0.948
méphistophélès        0.947
aliénée               0.947
--- Voisins de 'sorcellerie' ---
phénoménal            0.953
maffia                0.952
contestait            0.952
discutables           0.951
virulence             0.951
incarné               0.951
hanter                0.951
eqfin                 0.951
érotisme              0.950
halves                0.950
--- Voisins de 'femme' ---
mariée                0.751
femmo                 0.721
enfant                0.710
fillo                 0.695
fille                 0.680
fillette              0.679
blanchisseuse         0.678
orpheline             0.677
tille                 0.673
homme                 0.669
---

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
