In [1]:
import pandas as pd
from pathlib import Path

# Project directories, resolved relative to the current working directory.
PROJECT_ROOT = Path("..").resolve()
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")
DATA_OUTPUTS = PROJECT_ROOT.joinpath("data", "outputs")

# Load the cleaned data file.
df = pd.read_csv(DATA_PROCESSED.joinpath("processed_data.csv"))

# --- 1. Free-text needs (long format) ---
besoin_text_cols = [
    "amelioration_pratiques",
    "besoins_aide_evaluation",
    "besoins_diversite_classe",
    "besoins_competence_numerique",
    "renforcement_collaboration",
    "preference_accompagnement",
    "besoin_dev_professionnel"
]

# One row per (teacher, open question) pair.
df_text = (
    df[["enseignant_uid"] + besoin_text_cols]
    .melt(
        id_vars="enseignant_uid",
        var_name="question_source",
        value_name="besoin_brut"
    )
    # Drop unanswered questions BEFORE the string cast: astype(str) turns NaN
    # into the literal text "nan", which dropna() can no longer detect, so the
    # missing answers would otherwise end up in the exported table.
    .dropna()
    .assign(besoin_brut=lambda frame: frame["besoin_brut"].astype(str).str.strip())
)

# Discard answers that are empty once stripped; .copy() makes the result an
# independent frame so the column assignment below is unambiguous.
df_text = df_text[df_text["besoin_brut"] != ""].copy()
df_text["categorie_besoin"] = None  # to be filled in by classification

# --- 2. MLF needs (closed questions answered strong / weak need) ---
besoin_mlf_cols = [
    "besoin_formation_plurilingue",
    "besoin_formation_transversales",
    "besoin_formation_maternelle",
    "besoin_formation_orientation",
    "besoin_formation_stiam"
]

# Long format, restricted to the answers equal to "besoin fort".
df_mlf = (
    df[["enseignant_uid"] + besoin_mlf_cols]
    .copy()
    .melt(id_vars="enseignant_uid", var_name="question_source", value_name="besoin_brut")
    .loc[lambda frame: frame["besoin_brut"].eq("besoin fort")]
    .copy()
)

# The category is taken straight from the column name: the
# "besoin_formation_" part is removed and the remainder lower-cased.
df_mlf = df_mlf.assign(
    categorie_besoin=df_mlf["question_source"]
    .str.replace("besoin_formation_", "", regex=False)
    .str.lower()
)

# --- 3. Final merge ---
df_besoins = pd.concat([df_text, df_mlf], ignore_index=True)

# Fix the column order of the exported table.
df_besoins = df_besoins[["enseignant_uid", "question_source", "besoin_brut", "categorie_besoin"]]

# Export. The output directory is created on demand so the cell also works on
# a fresh checkout where data/outputs does not exist yet (to_csv does not
# create missing parent directories).
DATA_OUTPUTS.mkdir(parents=True, exist_ok=True)
df_besoins.to_csv(DATA_OUTPUTS / "df_besoins.csv", index=False)

# Summary of what was written.
print(f"‚úÖ df_besoins global export√© avec {df_besoins.shape[0]} lignes")


‚úÖ df_besoins global export√© avec 866 lignes


In [3]:
# Preview the first rows of the merged needs table.
df_besoins.head()

Unnamed: 0,enseignant_uid,question_source,besoin_brut,categorie_besoin
0,E0001,amelioration_pratiques,les outils informatiques et enseigner sur 3 ni...,
1,E0002,amelioration_pratiques,"ma capacit√© √† enseigner dans une autre langue,...",
2,E0003,amelioration_pratiques,enrichir le contenu dans certaines disciplines...,
3,E0004,amelioration_pratiques,"les neurosciences m'inspirent de plus en plus,...",
4,E0005,amelioration_pratiques,la diff√©renciation est un point que j'aimerais...,


In [4]:
from sentence_transformers import SentenceTransformer
import os

# Load the sentence-embedding model from the local copy when there is one;
# otherwise fetch it by name and save it locally for the next run.
if not os.path.exists("../models/paraphrase_multilingual"):
    model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    model.save("../models/paraphrase_multilingual")
else:
    model = SentenceTransformer("../models/paraphrase_multilingual")


In [None]:
# Inspect the first 100 rows of the long-format free-text table.
# NOTE(review): this renders up to 100 rows; a smaller preview may be enough.
df_text.head(100)

# APPROCHE TOPIC MODELING

In [None]:
# --- 1. Imports
import os
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS

# --- 2. Configuration (silences the tokenizers warning)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- 3. Load the French spaCy model
nlp = spacy.load("fr_core_news_md")

# Custom words to add to the stop-word list
custom_stopwords = {
    "de", "des", "et", "les", "la", "le", "en", "un", "une", "du", "au", "aux",
    "pour", "par", "avec", "dans", "sur", "sous", "entre", "chez", "comme",
    "ou", "mais", "donc", "or", "ni", "car", "que", "qu", "qui", "quoi", "dont",
    "cela", "ce", "cette", "ces", "√ßa", "c'", "il", "elle", "ils", "elles", "on",
    "je", "tu", "nous", "vous", "leur", "leurs", "mon", "ton", "son", "ma", "ta",
    "sa", "mes", "tes", "ses", "nos", "vos", "√™tre", "avoir", "fait", "faire",
    "plus", "moins", "tr√®s", "trop", "encore", "d√©j√†", "aussi", "autre", "autres",
    "peu", "beaucoup", "jamais", "toujours", "souvent", "rarement", "fois",
    "ex", "exemple", "type", "via", "afin", "lors", "pendant", "vers", "ainsi",
    "donc", "alors", "bien", "mal", "d√®s", "tout", "tous", "toutes", "aucun",
    "aucune", "chaque", "certains", "certaines", "aucuns", "diff√©rents",
    "divers", "quelque", "quelques", "quel", "quelle", "quels", "quelles",
    "tel", "telle", "tels", "telles", "ex.", "etc","tsa","ai",'√©l√®ve',
}

# Merge the custom words into the imported spaCy stop-word set. `|=` updates
# that set object in place, so the addition is global for this session.
STOP_WORDS |= custom_stopwords

# --- 4. Lemmatisation + nettoyage
def preprocess_text(text):
    """Lower-case *text*, run it through ``nlp`` and return the kept lemmas.

    A token is dropped when it is punctuation, when it is whitespace, or when
    its lemma belongs to ``STOP_WORDS``. The remaining lemmas are joined with
    single spaces, in their original order.
    """
    kept_lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_punct or tok.is_space:
            continue
        if tok.lemma_ in STOP_WORDS:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# --- 5. Select the rows of df_besoins that have no category yet, normalise
# their text, and keep only answers longer than 5 characters.
df_text_only = (
    df_besoins.loc[df_besoins["categorie_besoin"].isna()]
    .assign(besoin_brut=lambda frame: frame["besoin_brut"].astype(str).str.strip())
    .loc[lambda frame: frame["besoin_brut"].str.len() > 5]
)

# --- 6. Lemmatisation
besoins_textuels = df_text_only["besoin_brut"].tolist()
besoins_lemmatise = list(map(preprocess_text, besoins_textuels))

# --- 7. Vectorizer + embedding model
# No stop-word list on the vectorizer: the texts were already filtered during
# lemmatisation.
vectorizer_model = CountVectorizer()
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# --- 8. BERTopic model
# NOTE(review): no random seed is set anywhere in this cell, so topic
# assignments may differ from one run to the next — confirm whether a seeded
# dimensionality-reduction model should be supplied.
topic_model = BERTopic(
    language="french",
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
)

# --- 9. Training on the lemmatised texts
topics, probs = topic_model.fit_transform(besoins_lemmatise)

# --- 10. Topic summary
df_topics = topic_model.get_topic_info()
print(df_topics.head(10))

# --- 11. Bar-chart visualisation, only when the summary has more than 2 rows
if len(df_topics) <= 2:
    print("‚ùó Pas assez de topics pour visualiser.")
else:
    topic_model.visualize_barchart(top_n_topics=10).show()

# --- 12. Assemble the per-response result table: identifiers, raw text,
# lemmatised text and the topic id returned by the model.
df_resultats = pd.DataFrame({
    **{
        id_col: df_text_only[id_col].tolist()
        for id_col in ("enseignant_uid", "question_source")
    },
    "besoin_brut": besoins_textuels,
    "besoin_lemmatise": besoins_lemmatise,
    "topic": topics,
})


   Topic  Count                                        Name  \
0     -1    153            -1_formation_√©l√®ve_besoin_classe   
1      0     90            0_√©l√®ve_p√©dagogique_aimer_classe   
2      1     75        1_num√©rique_outil_√©l√®ve_intelligence   
3      2     51  2_temps_travail_concertation_collaboration   
4      3     45               3_utilisation_ia_besoin_point   
5      4     40     4_√©quipe_coll√®gue_collaboration_√©change   
6      5     21       5_√©valuation_√©l√®ve_besoin_diff√©renci√©   
7      6     18             6_anglais_langue_langage_sortie   
8      7     17        7_formation_pr√©sentiel_master_module   
9      8     14                 8_cycle_inter_science_√©crit   

                                      Representation  \
0  [formation, √©l√®ve, besoin, classe, apprentissa...   
1  [√©l√®ve, p√©dagogique, aimer, classe, gestion, b...   
2  [num√©rique, outil, √©l√®ve, intelligence, artifi...   
3  [temps, travail, concertation, collaboration, ...   


In [27]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

# Responses that the main model assigned to topic -1.
df_topic_minus1 = df_resultats.loc[df_resultats["topic"].eq(-1)]
besoins_brut_topic_minus1 = df_topic_minus1["besoin_brut"].tolist()

# Embed the raw (non-lemmatised) texts.
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedding_model.encode(besoins_brut_topic_minus1, show_progress_bar=True)

# Second model with a more permissive clusterer (min_cluster_size=3,
# min_samples=1) than the BERTopic default.
sub_topic_model = BERTopic(
    language="french",
    embedding_model=embedding_model,
    hdbscan_model=HDBSCAN(min_samples=1, min_cluster_size=3),
)

sub_topics, _ = sub_topic_model.fit_transform(besoins_brut_topic_minus1, embeddings)

# Show the first 10 rows of the sub-topic summary.
df_sub_topics = sub_topic_model.get_topic_info()
print(df_sub_topics.head(10))


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

   Topic  Count                                   Name  \
0     -1      9      -1_autres_chimie_physique_projets   
1      0     11        0_diversit√©_de_culturelle_leurs   
2      1      8     1_scratch_concr√®te_r√©p√©t√©e_groupes   
3      2      8         2_arabe_langue_arabophones_non   
4      3      8  3_exp√©rience_pr√©sentiel_ces_formation   
5      4      7  4_neurosciences_former_sciences_neuro   
6      5      6                      5_book_and_the_to   
7      6      6          6_elea_accompagnement_ebep_ia   
8      7      6       7_grilles_√©quipe_supports_besoin   
9      8      6                  8_ex_mener_pas_outils   

                                      Representation  \
0  [autres, chimie, physique, projets, un, davant...   
1  [diversit√©, de, culturelle, leurs, la, et, bes...   
2  [scratch, concr√®te, r√©p√©t√©e, groupes, ensuite,...   
3  [arabe, langue, arabophones, non, √©l√®ves, pap,...   
4  [exp√©rience, pr√©sentiel, ces, formation, forma...   
5  [neur

In [28]:
import pandas as pd
from pathlib import Path

# Paths
PROJECT_ROOT = Path("..").resolve()
DATA_PROCESSED, DATA_OUTPUTS = (
    PROJECT_ROOT / "data" / subdir for subdir in ("processed", "outputs")
)

# Load the processed data file.
df = pd.read_csv(DATA_PROCESSED / "processed_data.csv")

# Open-ended answers
besoin_text_cols = [
    "amelioration_pratiques", "besoins_aide_evaluation", "besoins_diversite_classe",
    "besoins_competence_numerique", "renforcement_collaboration",
    "preference_accompagnement", "besoin_dev_professionnel"
]

df_text = df[["enseignant_uid"] + besoin_text_cols].melt(
    id_vars="enseignant_uid", var_name="question_source", value_name="besoin_brut"
)
# Remove unanswered questions while they are still NaN. Casting first would
# turn them into the literal text "nan", which the empty-string filter below
# does not catch, so they would be exported as if they were real answers.
df_text = df_text.dropna(subset=["besoin_brut"]).copy()
df_text["besoin_brut"] = df_text["besoin_brut"].astype(str).str.strip()
# .copy() gives an independent frame, so adding a column afterwards does not
# write to a slice of the previous frame (no SettingWithCopyWarning).
df_text = df_text[df_text["besoin_brut"] != ""].copy()
df_text["categorie_besoin"] = None

# "besoin fort" answers (closed MLF questions)
besoin_mlf_cols = [
    "besoin_formation_plurilingue", "besoin_formation_transversales",
    "besoin_formation_maternelle", "besoin_formation_orientation", "besoin_formation_stiam"
]

df_mlf = df[["enseignant_uid"] + besoin_mlf_cols].melt(
    id_vars="enseignant_uid", var_name="question_source", value_name="besoin_brut"
)
# .copy() detaches the filtered rows from the melted frame; without it the
# column assignment below writes to a slice and pandas emits a
# SettingWithCopyWarning.
df_mlf = df_mlf[df_mlf["besoin_brut"] == "besoin fort"].copy()
# Category = column name without the "besoin_formation_" part, lower-cased.
df_mlf["categorie_besoin"] = df_mlf["question_source"].str.replace("besoin_formation_", "", regex=False).str.lower()

# Merge the free-text and closed-question rows into one table.
df_besoins = pd.concat([df_text, df_mlf], ignore_index=True)

# Create the output directory if needed: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
DATA_OUTPUTS.mkdir(parents=True, exist_ok=True)
df_besoins.to_csv(DATA_OUTPUTS / "df_besoins.csv", index=False)


In [29]:
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS

# Custom stop words
# Words to add to the stop-word list
custom_stopwords = {
    "de", "des", "et", "les", "la", "le", "en", "un", "une", "du", "au", "aux",
    "pour", "par", "avec", "dans", "sur", "sous", "entre", "chez", "comme",
    "ou", "mais", "donc", "or", "ni", "car", "que", "qu", "qui", "quoi", "dont",
    "cela", "ce", "cette", "ces", "√ßa", "c'", "il", "elle", "ils", "elles", "on",
    "je", "tu", "nous", "vous", "leur", "leurs", "mon", "ton", "son", "ma", "ta",
    "sa", "mes", "tes", "ses", "nos", "vos", "√™tre", "avoir", "fait", "faire",
    "plus", "moins", "tr√®s", "trop", "encore", "d√©j√†", "aussi", "autre", "autres",
    "peu", "beaucoup", "jamais", "toujours", "souvent", "rarement", "fois",
    "ex", "exemple", "type", "via", "afin", "lors", "pendant", "vers", "ainsi",
    "donc", "alors", "bien", "mal", "d√®s", "tout", "tous", "toutes", "aucun",
    "aucune", "chaque", "certains", "certaines", "aucuns", "diff√©rents",
    "divers", "quelque", "quelques", "quel", "quelle", "quels", "quelles",
    "tel", "telle", "tels", "telles", "ex.", "etc","tsa","ai",'√©l√®ve',
}

# Merge the custom words into the imported spaCy stop-word set. `|=` updates
# that set object in place, so the addition is global for this session.
STOP_WORDS |= custom_stopwords

# Load the French spaCy model used by preprocess_text.
nlp = spacy.load("fr_core_news_md")

def preprocess_text(text):
    """Return the space-joined lemmas of *text* after stop-word filtering.

    The text is lower-cased and parsed with ``nlp``. Tokens whose lemma is in
    ``STOP_WORDS``, punctuation tokens and whitespace tokens are left out.
    """
    def _is_content(token):
        # A token is kept only when none of the three exclusion tests holds.
        return not (token.lemma_ in STOP_WORDS or token.is_punct or token.is_space)

    return " ".join(token.lemma_ for token in nlp(text.lower()) if _is_content(token))


In [30]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Rows without a category yet (the free-text answers), restricted to answers
# longer than 5 characters.
df_text_only = (
    df_besoins.loc[df_besoins["categorie_besoin"].isna()]
    .copy()
    .loc[lambda frame: frame["besoin_brut"].str.len() > 5]
)

# Raw texts and their lemmatised counterparts, in the same order.
besoins_brut = df_text_only["besoin_brut"].tolist()
besoins_lemmatise = list(map(preprocess_text, besoins_brut))

# Bag-of-words vectorizer with default settings, and the multilingual
# sentence-embedding model.
vectorizer_model = CountVectorizer()
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# NOTE(review): no random seed is set here, so topic assignments may vary
# between runs — confirm whether reproducibility is required.
topic_model = BERTopic(
    language="french",
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
)

# Fit on the lemmatised texts; only the topic ids are used afterwards.
topics, _ = topic_model.fit_transform(besoins_lemmatise)

# One row per response: identifiers, raw text, lemmatised text, topic id.
df_resultats = pd.DataFrame(
    dict(
        enseignant_uid=df_text_only["enseignant_uid"].tolist(),
        question_source=df_text_only["question_source"].tolist(),
        besoin_brut=besoins_brut,
        besoin_lemmatise=besoins_lemmatise,
        topic=topics,
    )
)


In [31]:
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS

# Second pass on the responses the main model left in topic -1.
df_topic_minus1 = df_resultats[df_resultats["topic"] == -1].copy()
besoins_minus1 = df_topic_minus1["besoin_brut"].tolist()
embeddings_minus1 = embedding_model.encode(besoins_minus1)

sub_topic_model = BERTopic(
    embedding_model=embedding_model,
    language="french",
    # This model is fitted on the raw texts, so stop words must be removed at
    # the bag-of-words stage; with the default vectorizer the topic labels are
    # dominated by function words. Embeddings are still computed on the full
    # raw sentences. scikit-learn may warn that some stop words (those with
    # apostrophes or hyphens) do not match its tokenisation; that is harmless.
    vectorizer_model=CountVectorizer(stop_words=sorted(STOP_WORDS)),
    # Permissive clustering: clusters of 3 responses are accepted.
    hdbscan_model=HDBSCAN(min_cluster_size=3, min_samples=1)
)

sub_topics, _ = sub_topic_model.fit_transform(besoins_minus1, embeddings_minus1)

df_topic_minus1["sub_topic"] = sub_topics
# Sub-topic ids are shifted past the largest main-topic id so the two id
# ranges never collide; -1 is kept as -1.
topic_offset = topic_model.get_topic_info()["Topic"].max() + 1
df_topic_minus1["topic"] = df_topic_minus1["sub_topic"].where(
    df_topic_minus1["sub_topic"] == -1,
    df_topic_minus1["sub_topic"] + topic_offset,
)


In [34]:
# Keep only the responses that ended up in a real topic, from either pass.
df_resultats_main = df_resultats.loc[df_resultats["topic"].ne(-1)]
df_resultats_sub = df_topic_minus1.loc[df_topic_minus1["topic"].ne(-1)]

df_resultats_all = pd.concat((df_resultats_main, df_resultats_sub), ignore_index=True)

# One row per teacher: the distinct topic ids they expressed, plus their
# "etablissement" value looked up in df.
# NOTE(review): topics_exprimes holds array values; confirm that their string
# form is acceptable wherever this table is written to CSV.
df_profils = (
    df_resultats_all.groupby("enseignant_uid")["topic"]
    .unique()
    .rename("topics_exprimes")
    .reset_index()
    .merge(
        df[["enseignant_uid", "etablissement"]],
        how="left",
        on="enseignant_uid",
    )
)


In [35]:
# Topic summaries of both models, without the -1 row.
df_topics_main = topic_model.get_topic_info().query("Topic != -1")
# .assign builds a new frame instead of writing a column into the filtered
# result of .query(), which triggers a SettingWithCopyWarning.
df_topics_sub = (
    sub_topic_model.get_topic_info()
    .query("Topic != -1")
    .assign(Topic=lambda info: info["Topic"] + topic_offset)
)

# Stack both summaries and switch to the export column names.
df_themes = pd.concat([
    df_topics_main[["Topic", "Name", "Count", "Representative_Docs"]],
    df_topics_sub[["Topic", "Name", "Count", "Representative_Docs"]]
], ignore_index=True)

df_themes.columns = ["theme_id", "theme_nom", "nb_reponses", "exemples"]


In [None]:
# NOTE(review): this cell repeats the theme-table construction that appears
# earlier in the notebook. Re-running it is safe because everything is rebuilt
# from the fitted models, but one of the two copies can probably be removed.
df_topics_main = topic_model.get_topic_info().query("Topic != -1")
# .copy() detaches the filtered rows so the id shift below does not write to
# a slice of the summary frame (avoids a SettingWithCopyWarning).
df_topics_sub = sub_topic_model.get_topic_info().query("Topic != -1").copy()
df_topics_sub["Topic"] = df_topics_sub["Topic"] + topic_offset

df_themes = pd.concat([
    df_topics_main[["Topic", "Name", "Count", "Representative_Docs"]],
    df_topics_sub[["Topic", "Name", "Count", "Representative_Docs"]]
], ignore_index=True)

# Rename to the export column names (positional, same order as selected).
df_themes.columns = ["theme_id", "theme_nom", "nb_reponses", "exemples"]


In [38]:
from pathlib import Path

# Paths relative to a notebook located in `notebooks/`
PROJECT_ROOT = Path("..").resolve()
DATA_OUTPUTS = PROJECT_ROOT / "data" / "outputs"

# Create the output directory if needed: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
DATA_OUTPUTS.mkdir(parents=True, exist_ok=True)

# Save the three result tables.
df_themes.to_csv(DATA_OUTPUTS / "plan_formation_themes.csv", index=False)
df_resultats_all.to_csv(DATA_OUTPUTS / "reponses_par_topic.csv", index=False)
df_profils.to_csv(DATA_OUTPUTS / "enseignants_par_theme.csv", index=False)
